from dotenv import load_dotenv
load_dotenv("dev.env")
import os
db_name = os.getenv("db_name")
db_username = os.getenv("db_username")
db_password = os.getenv("db_password")
db_host = os.getenv("db_host")
db_port = os.getenv("db_port")
website = os.getenv("website")
city_link = os.getenv("city_link")
chrome_path = os.getenv("chrome_path")
selenium_chrome_driver_path = os.getenv("selenium_chrome_driver_path")
First, we load our environment variables. This is not strictly required for hobby projects, but it is good practice to keep private values such as database credentials out of the source code, since leaking them could make your server vulnerable.
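For reference, a minimal sketch of what dev.env might contain; every value below is a placeholder, not a real credential or the actual site.
# dev.env (example values only)
db_name=food_delivery
db_username=postgres
db_password=change_me
db_host=localhost
db_port=5432
website=https://www.example-food-site.com
city_link=https://www.example-food-site.com/istanbul
chrome_path=/usr/bin/google-chrome
selenium_chrome_driver_path=/usr/local/bin/chromedriver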
import psycopg2
import time
import math
import numpy as np
import os.path
import pandas as pd
import re
import pycld2
import nltk
import joblib
import warnings
import urllib.request
import io
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import keras
import sys
import sweetviz as sv
import webbrowser
import gc
import matplotlib.image as mpimg
import shap
import seaborn as sns
from datetime import date, timedelta, datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDRegressor, ElasticNet, BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.kernel_ridge import KernelRidge
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn import mixture
from sklearn.metrics import accuracy_score, f1_score, precision_score, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from PIL import Image
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD, RMSprop
from sklearn import metrics
from sklearn.utils import shuffle
from wordcloud import WordCloud
from zemberek import TurkishSentenceNormalizer, TurkishMorphology
from threading import Timer
from scipy import stats
#from IPython.display import display, Image
nltk.download('stopwords', quiet=True)
stemmer = TurkishStemmer()
conn_string = 'host={pghost} port={pgport} dbname={pgdatabase} user={pguser} password={pgpassword}'.format(pgdatabase=db_name,pguser=db_username,pgpassword=db_password,pghost=db_host,pgport=db_port)
conn=psycopg2.connect(conn_string)
cur=conn.cursor()
options = Options()
options.binary_location = chrome_path
warnings.filterwarnings('ignore')
def check_if_table_exists(schema,table):
cur.execute("select exists(select * from information_schema.tables where table_schema='{schema}' AND table_name='{table}')".format(schema=schema, table=table))
return cur.fetchone()[0]
def check_if_index_exists(index):
cur.execute("SELECT EXISTS(SELECT * FROM PG_CLASS WHERE relname = '{index}')".format(index=index))
return cur.fetchone()[0]
def check_if_file_exists(filename):
return os.path.isfile(filename)
def execute_mogrify(conn, df, schema, table): # inserts a dataframe row by row; values must already be SQL-literal strings (see df_column_conversation)
tuples = [tuple(x) for x in df.to_numpy()]
cols = '"'+'","'.join(list(df.columns))+'"'
cursor = conn.cursor()
try:
for tup in tuples:
query = """INSERT INTO "{schema}"."{table}"({cols}) VALUES ({values}) ON CONFLICT DO NOTHING""".format(schema=schema,table=table, cols=cols, values=",".join(map(str,tup)))
cursor.execute(query)
conn.commit()
except (Exception, psycopg2.DatabaseError) as error:
print("Error: %s" % error)
conn.rollback()
cursor.close()
return 1
cursor.close()
def df_column_conversation(df, column_name, type): # rewrites a column in place into SQL-literal strings for interpolation into INSERT statements
if(type == 'timestamp'):
df[column_name] = df[column_name].apply(lambda x: f"'{x}'::timestamp")
if(type == 'text'):
df[column_name] = df[column_name].str.replace("'","").apply(lambda x: f"'{x}'")
if(type == 'date'):
df[column_name] = df[column_name].apply(lambda x: f"'{x}'::date")
if(type == 'numeric'):
df[column_name] = df[column_name].apply(str).str.replace(',','.')
if(type == 'integer'):
df[column_name] = df[column_name].apply(str).str.replace(',','.').apply(float).astype('Int64').apply(str)
def clean_text(text):
text = text.lower()
text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=\\-]", "", text) # '-' moved to the end of the character class so it matches a literal hyphen instead of defining a range
return text
def webpage_scroll_down(driver, scroll_speed):
total_height = int(driver.execute_script("return document.body.scrollHeight"))
for i in range(1, total_height, scroll_speed):
driver.execute_script("window.scrollTo(0, {});".format(i))
def byte_image_to_numpy_array(byte_image, image_size):
return cv2.resize(np.array(Image.open(io.BytesIO(byte_image))), (image_size, image_size))
def analyze_data(outputHTML,upToDateCheck,schema,table):
if not((check_if_file_exists(outputHTML) and upToDateCheck)):
sql_command = """
SELECT *
FROM "{schema}"."{table}";
""".format(schema=schema, table=table)
df = pd.read_sql(sql_command,conn)
eda = sv.analyze(df)
eda.show_html(outputHTML, open_browser=False)
del df
gc.collect()
#eda_sales.show_notebook()
webbrowser.open(outputHTML,new=2)
print('You can see the exploratory data analysis visualizations in your browser.')
def analyze_dataframe(outputHTML,upToDateCheck,df):
if not((check_if_file_exists(outputHTML) and upToDateCheck)):
eda = sv.analyze(df)
eda.show_html(outputHTML, open_browser=False)
del df
gc.collect()
#eda_sales.show_notebook()
webbrowser.open(outputHTML,new=2)
print('You can see the exploratory data analysis visualizations in your browser.')
def feature_importance_plot(label, model_name, feature_columns, feature_importance):
plt.title('{label} {model_name} Feature Importance'.format(label=label.title(), model_name=model_name))
plt.barh(feature_columns, feature_importance) # plot the importances passed in as an argument rather than an undefined global 'model'
plt.tight_layout()
plt.autoscale()
plt.savefig('images/{label}.png'.format(label=label))
plt.close()
def merge_images(image_paths, orientation='horizontal', background_color='WHITE'):
if(type(image_paths[0])!=Image.Image):
images = [Image.open(x) for x in image_paths]
else:
images = image_paths
min_shape = sorted([(np.sum(i.size), i.size) for i in images])[0][1]
if(orientation=='horizontal'):
images_combined = np.hstack([np.asarray(i.resize(min_shape)) for i in images]) # list comprehension; hstack no longer accepts generators
if(orientation=='vertical'):
images_combined = np.vstack([np.asarray(i.resize(min_shape)) for i in images])
merged_im = Image.fromarray(images_combined)
merged_image = merged_im
if(background_color!=''):
merged_image = Image.new("RGBA", merged_im.size, background_color)
merged_image.paste(merged_im, (0, 0), merged_im)
return merged_image
def turkish_to_english_characters(text):
tr_en_chars = {'ı':'i', 'ğ':'g', 'İ':'I', 'Ğ':'G', 'ç':'c', 'Ç':'C', 'ş':'s', 'Ş':'S', 'ö':'o', 'Ö':'O', 'ü':'u', 'Ü':'U'}
for tr, en in tr_en_chars.items():
text = text.replace(tr, en)
return text
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
def normalize_text(text, timeout=10):
try:
Timer(timeout, sys.exit).start() # pass the callable itself; calling sys.exit() here raised SystemExit immediately, so normalization never ran
return normalizer.normalize(text)
except:
return text
def stem_text(text):
words = text.split()
stemmed_words = ([stemmer.stem(word) for word in words])
return ' '.join(stemmed_words)
def read_txt_file(location, encoding, column_names):
with open(location, encoding=encoding) as file:
lines = file.readlines()
lines = [line.rstrip() for line in lines]
return pd.DataFrame(lines, columns=column_names)
positive_words_df = read_txt_file('positive_words_tr.txt','utf-8',['WORD'])
negative_words_df = read_txt_file('negative_words_tr.txt','utf-8',['WORD'])
sentiment_words_list = pd.concat([positive_words_df, negative_words_df])['WORD'].values.tolist()
def sentiment_text(text):
sentiment_value = 0
try:
text = text.lower()
word_list = text.split()
word_list = [word for word in word_list if word in sentiment_words_list] # filter with a comprehension; removing items while iterating skips elements
for word in word_list:
if word in positive_words_df["WORD"].values:
sentiment_value+= 1
text = text.replace(word, "")
if word in negative_words_df["WORD"].values:
sentiment_value-= 1
text = text.replace(word, "")
finally:
return sentiment_value
def kstest_result(df, column_name, distribution='norm'):
df_name = [x for x in globals() if globals()[x] is df][0]
kstest = stats.kstest(df[column_name][pd.notnull(df[column_name])].values, distribution)
return "Kolmogorov-Smirnov test in {df_name} for {column_name} statistic: {statistic_value:0.4f}, p value: {p_value:0.4f}".format(df_name=df_name, column_name=column_name.capitalize(), statistic_value=kstest[0], p_value=kstest[1])
def distribution_plot(df, column_name):
df_name = [x for x in globals() if globals()[x] is df][0]
df = df[pd.notnull(df[column_name])]
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw= {"height_ratios": (0.2, 1)})
mean=df[column_name].mean()
median=df[column_name].median()
mode=df[column_name].mode().values[0]
sns.boxplot(data=df, x=column_name, ax=ax_box)
ax_box.axvline(mean, color='r', linestyle='--')
ax_box.axvline(median, color='g', linestyle='-')
ax_box.axvline(mode, color='b', linestyle='-')
sns.histplot(data=df, x=column_name, ax=ax_hist, kde=True)
ax_hist.axvline(mean, color='r', linestyle='--', label="Mean")
ax_hist.axvline(median, color='g', linestyle='-', label="Median")
ax_hist.axvline(mode, color='b', linestyle='-', label="Mode")
ax_hist.legend()
ax_box.set(xlabel='')
plt.savefig('images/distribution_plot_{df_name}_{column_name}.png'.format(df_name=df_name, column_name=column_name))
plt.close()
2021-09-01 03:54:24,442 - zemberek.morphology.turkish_morphology - INFO Msg: TurkishMorphology instance initialized in 8.647772312164307
We define our helper functions and import the libraries that will be used in the next steps.
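Because df_column_conversation and execute_mogrify drive every insert below, here is a minimal sketch (with made-up values) of what the conversion produces: each cell is rewritten into a SQL-literal string so it can be interpolated directly into an INSERT statement.
demo_df = pd.DataFrame({"NAME": ["Luigi's Pizza"], "PRICE": ["42,5"]})
df_column_conversation(demo_df, 'NAME', 'text')      # "Luigi's Pizza" -> "'Luigis Pizza'" (single quotes stripped, then wrapped)
df_column_conversation(demo_df, 'PRICE', 'numeric')  # "42,5" -> "42.5" (decimal comma replaced with a dot)
print(demo_df.iloc[0].tolist())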
if(check_if_table_exists('ODS','EXT_FB_RESTAURANT')):
print('Table ODS.EXT_FB_RESTAURANT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_RESTAURANT"
(
"RESTAURANT_ID" text NOT NULL,
"RESTAURANT_NAME" text,
"RESTAURANT_LINK" text,
"DATE" date,
CONSTRAINT "RESTAURANT_ID" UNIQUE ("RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_RESTAURANT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_MENU')):
print('Table ODS.EXT_FB_MENU already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_MENU"
(
"PRODUCT_ID" text NOT NULL,
"RESTAURANT_ID" text,
"CATEGORY_NAME" text,
"PRODUCT_NAME" text,
"PRODUCT_DESCRIPTION" text,
"PRODUCT_LISTED_PRICE" text,
"PRODUCT_PRICE" text,
"DISCOUNT" boolean,
"DESIGN_TYPE" text,
"DATE" date,
CONSTRAINT "PRODUCT_ID" UNIQUE ("PRODUCT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_MENU created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_COMMENT')):
print('Table ODS.EXT_FB_COMMENT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_COMMENT"
(
"RESTAURANT_ID" text,
"USERNAME" text,
"COMMENT_TEXT" text,
"COMMENT_DATE" text,
"SPEED" text,
"SERVING" text,
"FLAVOUR" text,
"DATE" date,
CONSTRAINT "UNIQUE_COMMENTS" UNIQUE ("RESTAURANT_ID", "USERNAME", "COMMENT_TEXT")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_COMMENT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_PRODUCT_IMAGE')):
print('Table ODS.EXT_FB_PRODUCT_IMAGE already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_PRODUCT_IMAGE"
(
"PRODUCT_ID" text,
"RESTAURANT_ID" text,
"IMAGE_LINK" text,
"DATE" date,
CONSTRAINT "UNIQUE_IMAGES" UNIQUE ("PRODUCT_ID", "RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_PRODUCT_IMAGE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_PRODUCT_IMAGE_SOURCE')):
print('Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE"
(
"IMAGE_LINK" text NOT NULL,
"SOURCE" bytea,
CONSTRAINT "UNIQUE_SOURCE" PRIMARY KEY ("IMAGE_LINK")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_RESTAURANT_SERVICE')):
print('Table ODS.EXT_FB_RESTAURANT_SERVICE already exists.')
else:
start_time = math.trunc(time.time())
cur.execute(
"""
CREATE TABLE "ODS"."EXT_FB_RESTAURANT_SERVICE"
(
"RESTAURANT_ID" text NOT NULL,
"MINIMUM_AMOUNT" text,
"SERVICE_TIME" text,
"DATE" date,
PRIMARY KEY ("RESTAURANT_ID"),
CONSTRAINT "UNIQUE_SERVICE" UNIQUE ("RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_RESTAURANT_SERVICE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_MENU')):
print('Table EDW.DWH_FB_MENU already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_MENU"
(
"PRODUCT_ID" text NOT NULL,
"RESTAURANT_ID" text,
"CATEGORY_NAME" text,
"PRODUCT_NAME" text,
"PRODUCT_DESCRIPTION" text,
"PRODUCT_LISTED_PRICE" numeric,
"PRODUCT_PRICE" numeric,
"DISCOUNT" boolean,
"DESIGN_TYPE" text,
"DATE" date,
CONSTRAINT "PRODUCT_ID" UNIQUE ("PRODUCT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_MENU created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_COMMENT')):
print('Table EDW.DWH_FB_COMMENT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_COMMENT"
(
"RESTAURANT_ID" text,
"USERNAME" text,
"COMMENT_TEXT" text,
"COMMENT_DATE" date,
"SPEED" integer,
"SERVING" integer,
"FLAVOUR" integer,
"DATE" date,
CONSTRAINT "UNIQUE_COMMENTS" UNIQUE ("RESTAURANT_ID", "USERNAME", "COMMENT_TEXT")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_COMMENT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_RESTAURANT_SERVICE')):
print('Table EDW.DWH_FB_RESTAURANT_SERVICE already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_RESTAURANT_SERVICE"
(
"RESTAURANT_ID" text,
"MINIMUM_AMOUNT" numeric,
"MINIMUM_SERVICE_TIME" integer,
"MAXIMUM_SERVICE_TIME" integer,
"DATE" date,
CONSTRAINT "UNIQUE_SERVICE" PRIMARY KEY ("RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_RESTAURANT_SERVICE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_RESTAURANT')):
print('Table EDW.DWH_FB_RESTAURANT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_RESTAURANT"
(
"RESTAURANT_ID" text,
"RESTAURANT_NAME" text,
"RESTAURANT_BRAND" text,
"RESTAURANT_DISTRICT" text,
"RESTAURANT_STREET" text,
"RESTAURANT_LINK" text,
"DATE" date,
CONSTRAINT "UNIQUE_RESTAURANT" UNIQUE ("RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_RESTAURANT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_COMMENT_SENTIMENT')):
print('Table EDW.DWH_FB_COMMENT_SENTIMENT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_COMMENT_SENTIMENT"
(
"RESTAURANT_ID" text,
"USERNAME" text,
"COMMENT_TEXT" text,
"SENTIMENT_SCORE" integer,
CONSTRAINT "UNIQUE_SENTIMENT" UNIQUE ("RESTAURANT_ID", "USERNAME", "COMMENT_TEXT")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_COMMENT_SENTIMENT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
cur.execute("""
CREATE OR REPLACE FUNCTION public.try_cast(_in text, INOUT _out anyelement)
LANGUAGE 'plpgsql'
AS $BODY$
BEGIN
EXECUTE format('SELECT %L::%s', $1, pg_typeof(_out))
INTO _out;
EXCEPTION WHEN others THEN
-- do nothing: _out already carries default
END
$BODY$;
""")
cur.execute('COMMIT;')
Table ODS.EXT_FB_RESTAURANT already exists.
Table ODS.EXT_FB_MENU already exists.
Table ODS.EXT_FB_COMMENT already exists.
Table ODS.EXT_FB_PRODUCT_IMAGE already exists.
Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE already exists.
Table ODS.EXT_FB_RESTAURANT_SERVICE already exists.
Table EDW.DWH_FB_MENU already exists.
Table EDW.DWH_FB_COMMENT already exists.
Table EDW.DWH_FB_RESTAURANT_SERVICE already exists.
Table EDW.DWH_FB_RESTAURANT already exists.
Table EDW.DWH_FB_COMMENT_SENTIMENT already exists.
Creating the tables that will contain our data, along with the database helper functions.
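The try_cast function defined above returns the supplied default (here a typed NULL) whenever the cast fails, instead of raising an error. A quick sanity check through the existing cursor:
cur.execute("SELECT try_cast('12', NULL::integer), try_cast('abc', NULL::integer);")
print(cur.fetchone())  # expected (12, None): the failing cast falls back to the NULL default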
cur.execute("""
WITH DATES AS(
SELECT
MAX("DATE") AS "DATE"
FROM "ODS"."EXT_FB_MENU" EFM
UNION ALL
SELECT
MAX("DATE") AS "DATE"
FROM "ODS"."EXT_FB_COMMENT" EFC
)
SELECT
MAX("DATE") AS "LAST_EXECUTION_DATE"
FROM DATES;
""")
last_execution_date = cur.fetchone()[0]
last_execution_date
datetime.date(2021, 6, 11)
Getting the last execution date recorded in the tables so that, for performance, we do not re-import data we have already collected.
restaurant_list = []
end_date = min(date(2021,6,11),(date.today() - timedelta(days=1)))
driver = webdriver.Chrome(options=options, executable_path=selenium_chrome_driver_path)
if(last_execution_date < end_date):
driver.get(city_link)
time.sleep(5)
for i in range(25):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
city_restaurant_groups = driver.find_elements_by_class_name("restaurant-main-info")
for restaurant in city_restaurant_groups:
restaurant_name = restaurant.find_element_by_class_name("restaurant-display-name").text
restaurant_name = restaurant_name.replace("YENİ ", "")
restaurant_link = restaurant.find_element_by_class_name("restaurant-display-name").find_element_by_xpath(".//a").get_attribute('href')
restaurant_id = restaurant_link.split("/")[-1]
if(len(restaurant_link) < 2):
continue
restaurant_list.append([restaurant_id,restaurant_name,restaurant_link])
restaurant_df = pd.DataFrame(restaurant_list, columns=["RESTAURANT_ID","RESTAURANT_NAME","RESTAURANT_LINK"])
df_column_conversation(restaurant_df, 'RESTAURANT_ID', 'text')
df_column_conversation(restaurant_df, 'RESTAURANT_NAME', 'text')
df_column_conversation(restaurant_df, 'RESTAURANT_LINK', 'text')
restaurant_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,restaurant_df,"ODS","EXT_FB_RESTAURANT")
Scraping the website to gather the restaurant list and restaurant links; in this part we need to keep scrolling down so that all of the restaurants are loaded.
sql_command = """
SELECT
"RESTAURANT_ID"
FROM "ODS"."EXT_FB_RESTAURANT" EFR
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_MENU" EFM WHERE EFR."RESTAURANT_ID" = EFM."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
if(last_execution_date < end_date):
for i in range(len(restaurant_df)):
sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
restaurant_link = "{website}/{sublink}".format(website=website,sublink=sublink)
driver.get(restaurant_link)
time.sleep(5)
try:
if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
continue
else:
pass
except Exception:
pass
menu = driver.find_element_by_xpath('//*[@id="restaurant_menu"]')
categories = menu.find_elements_by_xpath('//*[contains(@id,"menu_")]')
menu_list = []
for category in categories:
category_name = category.find_element_by_xpath(".//b").text
for product in category.find_elements_by_xpath(".//div[2]/ul/li"):
try:
design_type = "list"
try:
product_id = product.find_elements_by_class_name("getProductDetail")[-1].get_attribute('data-product-id')
product_name = product.find_elements_by_class_name("getProductDetail")[-1].text
except:
product_id = product.find_element_by_xpath(".//strong").get_attribute('data-product-id')
product_name = product.find_element_by_xpath(".//strong").text
design_type = "card"
try:
product_description = product.find_element_by_class_name("product-desc").text
product_price = product.find_element_by_class_name("price").text
except:
product_description = product.find_element_by_class_name("productInfo").text
product_price = product.find_element_by_class_name("newPrice").text
if(not(design_type=="card")):
design_type = "box"
discount = "TRUE"
try:
if(design_type=="list"):
product_listed_price = product.find_element_by_class_name("listed-price").text
if(design_type in ["card","box"]):
product_listed_price = product.find_element_by_class_name("listedPrice").text
except:
product_listed_price = product_price
discount = "FALSE"
menu_list.append([product_id,sublink,category_name,product_name,product_description,product_listed_price,product_price,discount,design_type])
except:
continue
menu_df = pd.DataFrame(menu_list, columns=["PRODUCT_ID","RESTAURANT_ID","CATEGORY_NAME","PRODUCT_NAME","PRODUCT_DESCRIPTION","PRODUCT_LISTED_PRICE","PRODUCT_PRICE","DISCOUNT","DESIGN_TYPE"])
menu_df = menu_df[menu_df['PRODUCT_ID'].str.len() > 0]
menu_df = menu_df[menu_df['PRODUCT_NAME'].str.len() > 0]
df_column_conversation(menu_df, 'PRODUCT_ID', 'text')
df_column_conversation(menu_df, 'RESTAURANT_ID', 'text')
df_column_conversation(menu_df, 'CATEGORY_NAME', 'text')
df_column_conversation(menu_df, 'PRODUCT_NAME', 'text')
df_column_conversation(menu_df, 'PRODUCT_DESCRIPTION', 'text')
df_column_conversation(menu_df, 'PRODUCT_LISTED_PRICE', 'text')
df_column_conversation(menu_df, 'PRODUCT_PRICE', 'text')
df_column_conversation(menu_df, 'DESIGN_TYPE', 'text')
menu_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,menu_df,"ODS","EXT_FB_MENU")
menu_df
Looping through all of the restaurants and scraping their products.
sql_command = """
SELECT
"RESTAURANT_ID"
FROM "ODS"."EXT_FB_RESTAURANT" EFR
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_COMMENT" EFC WHERE EFR."RESTAURANT_ID" = EFC."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df = restaurant_df.sample(frac=1).reset_index(drop=True) # reset the index so the .loc lookups below actually follow the shuffled order
restaurant_df
| RESTAURANT_ID | |
|---|---|
| 51 | al-ye-kadikoy-dumlupinar-mah-istanbul |
| 21 | familys-cafe-eyup-gokturk-merkez-mah-istanbul |
| 1 | pizzeria-eyup-gokturk-merkez-mah-istanbul |
| 49 | fennos-cafe-besiktas-cihannuma-mah-istanbul |
| 12 | antica-pizzeria-by-antica-locanda-besiktas-arn... |
| ... | ... |
| 35 | boysan-pizza-esenyurt-mehtercesme-mah-istanbul |
| 9 | fonseca-pizza-besiktas-dikilitas-mah-istanbul |
| 7 | salon-kalamis-kadikoy-fenerbahce-mah-istanbul |
| 27 | cali-kahvesi-uskudar-murat-reis-mah-istanbul |
| 73 | ice-munchies-avcilar-cihangir-mah-istanbul |
79 rows × 1 columns
if(last_execution_date < end_date):
for i in range(len(restaurant_df)):
sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
last_comment_page_url = "{website}/{sublink}?section=comments&page=9999".format(website=website,sublink=sublink)
driver.get(last_comment_page_url)
time.sleep(0.1)
comments_list = []
if(sublink not in driver.current_url):
continue
last_comment_page_redirect_url = driver.current_url
last_comment_page_number = int(last_comment_page_redirect_url.replace("&status=closed","").replace("{website}/{sublink}?section=comments&page=".format(website=website,sublink=sublink),""))
for page_number in range(1, last_comment_page_number+1):
current_comment_page_url = "{website}/{sublink}?section=comments&page={page_number}".format(website=website,sublink=sublink,page_number=page_number)
driver.get(current_comment_page_url)
time.sleep(1)
try:
if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
continue
else:
pass
except Exception:
pass
try:
driver.find_element(By.XPATH, '//*[@id="alternative-restaurant-popup"]/div[1]/div[2]/img').click(); #Closing pop-up
except Exception:
pass
#driver.find_element(By.XPATH, '//*[@id="restaurantDetail"]/div[2]/div[1]/ul/li[4]/a').click(); #Clicking comments
comment_list = driver.find_elements_by_class_name("comments-body")
for comment in comment_list:
try:
username = comment.find_element_by_class_name("userName").text
comment_text = comment.find_element_by_xpath('.//p').text
comment_date = comment.find_element_by_class_name("commentDate").text
except NoSuchElementException:
continue
try:
speed = comment.find_element_by_class_name("speed").text
except NoSuchElementException:
speed = ""
try:
serving = comment.find_element_by_class_name("serving").text
except NoSuchElementException:
serving = ""
try:
flavour = comment.find_element_by_class_name("flavour").text
except NoSuchElementException:
flavour = ""
comments_list.append([sublink, username, comment_text, comment_date, speed, serving, flavour])
comment_df = pd.DataFrame(comments_list, columns=["RESTAURANT_ID","USERNAME","COMMENT_TEXT","COMMENT_DATE","SPEED","SERVING","FLAVOUR"])
df_column_conversation(comment_df, 'RESTAURANT_ID', 'text')
df_column_conversation(comment_df, 'USERNAME', 'text')
df_column_conversation(comment_df, 'COMMENT_TEXT', 'text')
df_column_conversation(comment_df, 'COMMENT_DATE', 'text')
df_column_conversation(comment_df, 'SPEED', 'text')
df_column_conversation(comment_df, 'SERVING', 'text')
df_column_conversation(comment_df, 'FLAVOUR', 'text')
comment_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,comment_df,"ODS","EXT_FB_COMMENT")
comment_df
Looping through all of the restaurants and scraping their reviews, along with the scores given in each comment.
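A note on the pagination trick above: requesting ?section=comments&page=9999 makes the site redirect to the last existing comment page, so the final page number can be parsed out of the redirect URL. A minimal sketch with a hypothetical URL:
redirect_url = "https://www.example-food-site.com/some-restaurant?section=comments&page=87&status=closed"  # hypothetical redirect
last_page = int(redirect_url.replace("&status=closed", "").rsplit("page=", 1)[-1])
print(last_page)  # 87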
sql_command = """
SELECT DISTINCT
"RESTAURANT_ID",
"DESIGN_TYPE"
FROM "ODS"."EXT_FB_MENU" EFM
WHERE 1=1
AND EFM."DESIGN_TYPE" <> 'list'
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_PRODUCT_IMAGE" EFPI WHERE EFM."RESTAURANT_ID" = EFPI."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df
| RESTAURANT_ID | DESIGN_TYPE | |
|---|---|---|
| 0 | the-boss-pizza-burger-sariyer-maden-mah-istanbul | card |
| 1 | 444-pizza-bagcilar-demirkapi-mah-istanbul | card |
| 2 | mezzaluna-paket-beykoz-acarlar-mah-acarkent-is... | card |
| 3 | kafein-plus-kucukcekmece-tevfikbey-mah-sefakoy... | card |
| 4 | mayk-cafe-sisli-fulya-mah-istanbul | card |
| ... | ... | ... |
| 208 | amigos-burger-pizza-kagithane-merkez-mah-istanbul | card |
| 209 | konak-firin-avcilar-merkez-mah-istanbul | card |
| 210 | keyifle-pide-lahmacun-bakirkoy-atakoy-6-kisim-... | card |
| 211 | havelka-kadikoy-caddebostan-mah-istanbul | card |
| 212 | heros-pizza-esenyurt-fatih-mah-istanbul | card |
213 rows × 2 columns
if(last_execution_date < end_date):
for i in range(len(restaurant_df)):
sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
design_type = restaurant_df.loc[i,"DESIGN_TYPE"]
page_url = "{website}/{sublink}".format(website=website,sublink=sublink)
driver.get(page_url)
time.sleep(0.1)
webpage_scroll_down(driver, 15)
if(sublink not in driver.current_url):
continue
try:
if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
continue
else:
pass
except Exception:
pass
try:
driver.find_element(By.XPATH, '//*[@id="alternative-restaurant-popup"]/div[1]/div[2]/img').click(); #Closing pop-up
except Exception:
pass
menu = driver.find_element_by_xpath('//*[@id="restaurant_menu"]')
categories = menu.find_elements_by_xpath('//*[contains(@id,"menu_")]')
image_list = []
for product in menu.find_elements_by_class_name("product-image"):
try:
product_id = product.get_attribute('data-product-id')
image_link = product.find_element_by_xpath(".//img").get_attribute('src')
if("www.yemeksepeti.com/assets/images/" in image_link):
continue
image_list.append([product_id,sublink,image_link])
except:
pass
image_df = pd.DataFrame(image_list, columns=["PRODUCT_ID","RESTAURANT_ID","IMAGE_LINK"])
df_column_conversation(image_df, 'PRODUCT_ID', 'text')
df_column_conversation(image_df, 'RESTAURANT_ID', 'text')
df_column_conversation(image_df, 'IMAGE_LINK', 'text')
image_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,image_df,"ODS","EXT_FB_PRODUCT_IMAGE")
sql_command = """
SELECT DISTINCT
"RESTAURANT_ID"
FROM "ODS"."EXT_FB_MENU" EFM
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_RESTAURANT_SERVICE" EFRS WHERE EFM."RESTAURANT_ID" = EFRS."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df
| RESTAURANT_ID | |
|---|---|
| 0 | adrians-pizza-sultanbeyli-mehmet-akif-mah-ista... |
| 1 | afili-pizza-kadikoy-rasimpasa-mah-istanbul |
| 2 | antica-pizzeria-by-antica-locanda-besiktas-arn... |
| 3 | aron-pizza-atasehir-barbaros-mah-istanbul |
| 4 | ayicik-pizza-tatli-besiktas-dikilitas-mah-ista... |
| 5 | baca-pizzeria-sariyer-maden-mah-istanbul |
| 6 | balcoon-cafe-uskudar-kisikli-mah-istanbul |
| 7 | bee-pizza-esenler-davutpasa-mah-istanbul |
| 8 | big-slice-pizza-maltepe-baglarbasi-mah-istanbul |
| 9 | duses-pizza-esenyurt-ucevler-mah-istanbul |
| 10 | emporio-pizza-pasta-besiktas-levent-mah-istanbul |
| 11 | f-chef-italiano-pizza-pasta-sisli-halaskargazi... |
| 12 | hamurkeyf-fatih-cerrahpasa-mah-istanbul |
| 13 | heros-pizza-besiktas-dikilitas-mah-istanbul |
| 14 | little-caesars-pizza-bayrampasa-yenidogan-mah-... |
| 15 | little-caesars-pizza-fatih-aksaray-mah-istanbul |
| 16 | little-caesars-pizza-sariyer-maden-mah-istanbul |
| 17 | mandalin-mercan-tuzla-postane-mah-istanbul |
| 18 | mistic-pizza-burger-cafe-esenler-havaalani-mah... |
| 19 | panda-pizza-esenyurt-guzelyurt-mah-istanbul |
| 20 | pastavicio-makarna-pizza-kadikoy-caferaga-mah-... |
| 21 | pizza-bulls-sultangazi-ugur-mumcu-mah-istanbul |
| 22 | pizza-hays-cafe-pendik-velibaba-mah-istanbul |
| 23 | pizza-hut-atasehir-kucukbakkalkoy-mah-istanbul |
| 24 | pizza-hut-besiktas-levazim-mah-istanbul |
| 25 | pizza-hut-pendik-yenisehir-mah-kurtkoy-istanbul |
| 26 | pizza-hut-sariyer-ayazaga-mah-vadi-istanbul-bo... |
| 27 | pizza-hut-sisli-esentepe-mah-ozdilek-istanbul |
| 28 | pizza-station-kagithane-seyrantepe-mah-istanbul |
| 29 | pizzeria-eyup-gokturk-merkez-mah-istanbul |
| 30 | sbarro-pizza-atasehir-kucukbakkalkoy-mah-istanbul |
| 31 | sbarro-pizza-bakirkoy-osmaniye-mah-istanbul |
| 32 | sbarro-pizza-basaksehir-ziya-gokalp-mah-istanbul |
| 33 | sbarro-pizza-esenyurt-guzelyurt-mah-istanbul |
| 34 | sbarro-pizza-esenyurt-incirtepe-mah-istanbul |
| 35 | sbarro-pizza-esenyurt-saadetdere-mah-istanbul |
| 36 | sbarro-pizza-eyup-aksemsettin-mah-istanbul |
| 37 | sbarro-pizza-eyup-topcular-mah-istanbul |
| 38 | sbarro-pizza-pendik-guzelyali-mah-istanbul |
| 39 | sbarro-pizza-sisli-esentepe-mah-plazalar-istanbul |
| 40 | sbarro-pizza-umraniye-madenler-mah-istanbul |
| 41 | sbarro-pizza-umraniye-yamanevler-mah-istanbul |
| 42 | sbarro-pizza-uskudar-mimar-sinan-mah-istanbul |
| 43 | sehremini-tas-firini-fatih-sehremini-mah-capa-... |
| 44 | sembol-pasta-pide-pizza-esenler-menderes-mah-i... |
| 45 | torcello-ristorante-italiano-besiktas-akat-mah... |
if(last_execution_date < end_date):
for i in range(len(restaurant_df)):
sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
restaurant_link = "{website}/{sublink}".format(website=website,sublink=sublink)
driver.get(restaurant_link)
time.sleep(5)
try:
if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
continue
else:
pass
except Exception:
pass
minimum_amount = ""
service_time = ""
for shortInfoItem in driver.find_elements_by_class_name('shortInfoItem'):
if("Min. Tutar" in shortInfoItem.text):
minimum_amount = shortInfoItem.text.replace("Min. Tutar\n","").replace(" TL", "").replace(",",".")
if("Servis Süresi" in shortInfoItem.text):
service_time = shortInfoItem.text.replace("Servis Süresi\n","").replace(" dk.", "")
if(minimum_amount == "" and service_time == ""):
continue
service_df = pd.DataFrame([[sublink, minimum_amount, service_time]], columns=["RESTAURANT_ID","MINIMUM_AMOUNT","SERVICE_TIME"])
df_column_conversation(service_df, 'RESTAURANT_ID', 'text')
df_column_conversation(service_df, 'MINIMUM_AMOUNT', 'text')
df_column_conversation(service_df, 'SERVICE_TIME', 'text')
service_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,service_df,"ODS","EXT_FB_RESTAURANT_SERVICE")
driver.quit()
sql_command = """
SELECT DISTINCT
"IMAGE_LINK"
FROM "ODS"."EXT_FB_PRODUCT_IMAGE" EFPI
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" EFPIS WHERE EFPIS."IMAGE_LINK" = EFPI."IMAGE_LINK");
"""
image_df = pd.read_sql(sql_command,conn)
image_df = image_df.sample(frac=1).reset_index(drop=True)
image_df
| IMAGE_LINK | |
|---|---|
| 0 | https://cdn.yemeksepeti.com/ProductImages/TR_I... |
for i in range(len(image_df)):
page_url = image_df.loc[i,"IMAGE_LINK"]
image_file_name = "images/product.jpg"
try:
urllib.request.urlretrieve(page_url, image_file_name)
except:
continue
with open(image_file_name, 'rb') as image_file:
    image_binary = psycopg2.Binary(image_file.read())
cur.execute("""
INSERT INTO "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" ("IMAGE_LINK","SOURCE")
VALUES('{image_link}',{image_binary})
ON CONFLICT DO NOTHING;
""".format(image_link=page_url, image_binary=image_binary))
cur.execute('COMMIT;')
Looping through all of the restaurants and scraping their product images for deep learning purposes.
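To consume the stored images later, the byte_image_to_numpy_array helper defined earlier turns a bytea value back into a resized array. The sketch below assumes at least one row has been stored and uses an arbitrary 224-pixel target size.
cur.execute('SELECT "SOURCE" FROM "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" LIMIT 1;')
row = cur.fetchone()
if row is not None:
    image_array = byte_image_to_numpy_array(bytes(row[0]), 224)  # memoryview -> bytes -> 224x224 array
    print(image_array.shape)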
cur.execute("""
INSERT INTO "EDW"."DWH_FB_MENU"
SELECT
EFM."PRODUCT_ID",
EFM."RESTAURANT_ID",
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
REPLACE(REPLACE(EFM."PRODUCT_LISTED_PRICE",' TL',''),',','.')::numeric AS "PRODUCT_LISTED_PRICE",
REPLACE(REPLACE(EFM."PRODUCT_PRICE",' TL',''),',','.')::numeric AS "PRODUCT_PRICE",
EFM."DISCOUNT",
EFM."DESIGN_TYPE",
EFM."DATE"
FROM "ODS"."EXT_FB_MENU" EFM
ON CONFLICT DO NOTHING;
""")
cur.execute('COMMIT;')
cur.execute("""
INSERT INTO "EDW"."DWH_FB_RESTAURANT_SERVICE"
SELECT
EFRS."RESTAURANT_ID",
EFRS."MINIMUM_AMOUNT"::numeric AS "MINIMUM_AMOUNT",
SPLIT_PART(EFRS."SERVICE_TIME", '-', 1)::integer AS "MINIMUM_SERVICE_TIME",
SPLIT_PART(EFRS."SERVICE_TIME", '-', 2)::integer AS "MAXIMUM_SERVICE_TIME", -- SPLIT_PART also handles values such as '5-10' where the two sides differ in length
EFRS."DATE"
FROM "ODS"."EXT_FB_RESTAURANT_SERVICE" EFRS
ON CONFLICT DO NOTHING;
""")
cur.execute('COMMIT;')
cur.execute("""
INSERT INTO "EDW"."DWH_FB_RESTAURANT"
WITH STREET AS(
SELECT
EFR."RESTAURANT_ID",
EFR."RESTAURANT_NAME",
SUBSTRING(EFR."RESTAURANT_NAME", '\((.+)\)') AS "RESTAURANT_STREET",
EFR."RESTAURANT_LINK",
EFR."DATE"
FROM "ODS"."EXT_FB_RESTAURANT" EFR
)
SELECT
EFR."RESTAURANT_ID",
EFR."RESTAURANT_NAME",
SPLIT_PART(REPLACE(REPLACE(REPLACE(EFR."RESTAURANT_NAME",COALESCE(EFR."RESTAURANT_STREET",''),''),'(',''),')',''),',',1) AS "RESTAURANT_BRAND",
REVERSE(SPLIT_PART(REVERSE(REPLACE(REPLACE(REPLACE(EFR."RESTAURANT_NAME",COALESCE(EFR."RESTAURANT_STREET",''),''),'(',''),')','')),',',1)) AS "RESTAURANT_DISTRICT",
EFR."RESTAURANT_STREET",
EFR."RESTAURANT_LINK",
EFR."DATE"
FROM STREET EFR
ON CONFLICT DO NOTHING;
""")
cur.execute('COMMIT;')
if(last_execution_date < end_date):
sql_command = """
WITH CLEAN_DATA AS(
SELECT
EFC."RESTAURANT_ID",
EFC."USERNAME",
LOWER(EFC."COMMENT_TEXT") AS "COMMENT_TEXT",
EFC."COMMENT_DATE",
TRY_CAST(REGEXP_REPLACE(EFC."SPEED", '\D','','g'),NULL::INTEGER) AS "SPEED",
TRY_CAST(REGEXP_REPLACE(EFC."SERVING", '\D','','g'),NULL::INTEGER) AS "SERVING",
TRY_CAST(REGEXP_REPLACE(EFC."FLAVOUR", '\D','','g'),NULL::INTEGER) AS "FLAVOUR",
EFC."DATE",
REGEXP_REPLACE(EFC."COMMENT_DATE", '\D','','g')||' '||REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(REPLACE(EFC."COMMENT_DATE",' önce',''), '[^[:alpha:]]', '', 'g'),'ay','month'),'bugün','today'),'gün','day') AS "COMMENT_DATE_INTERVAL"
FROM "ODS"."EXT_FB_COMMENT" EFC
WHERE 1=1
AND EFC."USERNAME" <> 'Yemeksepeti'
)
SELECT
CD."RESTAURANT_ID",
CD."USERNAME",
CD."COMMENT_TEXT",
CASE WHEN CD."COMMENT_DATE_INTERVAL" = ' today' THEN CD."DATE" ELSE CD."DATE" - CAST(CD."COMMENT_DATE_INTERVAL" AS INTERVAL) END::date AS "COMMENT_DATE",
CD."SPEED",
CD."SERVING",
CD."FLAVOUR",
CD."DATE"
FROM CLEAN_DATA CD;
"""
comment_df = pd.read_sql(sql_command,conn)
comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(clean_text)
comment_df
df_column_conversation(comment_df, 'RESTAURANT_ID', 'text')
df_column_conversation(comment_df, 'USERNAME', 'text')
df_column_conversation(comment_df, 'COMMENT_TEXT', 'text')
df_column_conversation(comment_df, 'COMMENT_DATE', 'date')
df_column_conversation(comment_df, 'SPEED', 'integer')
df_column_conversation(comment_df, 'SERVING', 'integer')
df_column_conversation(comment_df, 'FLAVOUR', 'integer')
df_column_conversation(comment_df, 'DATE', 'date')
comment_df.replace('<NA>', 'NULL', inplace=True)
execute_mogrify(conn,comment_df,"EDW","DWH_FB_COMMENT")
sql_command = """
SELECT
EFR."RESTAURANT_ID",
EFR."USERNAME",
EFR."COMMENT_TEXT",
EFR."COMMENT_DATE",
EFR."SPEED",
CASE WHEN EFR."SPEED" > (AVG(EFR."SPEED") OVER()) THEN 'True' ELSE 'False' END AS "ABOVE_AVERAGE_SPEED",
EFR."SERVING",
CASE WHEN EFR."SERVING" > (AVG(EFR."SERVING") OVER()) THEN 'True' ELSE 'False' END AS "ABOVE_AVERAGE_SERVING",
EFR."FLAVOUR",
CASE WHEN EFR."FLAVOUR" > (AVG(EFR."FLAVOUR") OVER()) THEN 'True' ELSE 'False' END AS "ABOVE_AVERAGE_FLAVOUR",
EFR."DATE"
FROM "EDW"."DWH_FB_COMMENT" EFR;
"""
comment_df = pd.read_sql(sql_command,conn)
comment_df
| RESTAURANT_ID | USERNAME | COMMENT_TEXT | COMMENT_DATE | SPEED | ABOVE_AVERAGE_SPEED | SERVING | ABOVE_AVERAGE_SERVING | FLAVOUR | ABOVE_AVERAGE_FLAVOUR | DATE | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...o | herzamanki gibi mükemmel ellerinize saglik | 2021-05-26 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 1 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | favori pizzacım genelde madness söylüyordum yo... | 2021-05-25 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 2 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...t | şahane şimdi tekrarlıyorum siparişi | 2021-05-25 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 3 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | hamur seçimi yapılamıyor | 2021-05-24 | 10.0 | True | 9 | True | 7 | False | 2021-06-09 |
| 4 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...1 | pizza kesilmeden gönderilmişti lütfen dikkat e... | 2021-05-24 | 10.0 | True | 8 | False | 10 | True | 2021-06-09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271838 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | elinize sağlık çok başarılı | 2021-05-27 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 271839 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...i | super | 2021-05-27 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 271840 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...2 | pizzadan anlamayanların beğendiği bir pizza do... | 2021-05-27 | 10.0 | True | 5 | False | 5 | False | 2021-06-09 |
| 271841 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | cok guzeldi bu sefer tesekkurler | 2021-05-26 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 271842 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | hawaii pizza ve yarı yarıya pizzalar harika in... | 2021-05-26 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
271843 rows × 11 columns
sql_command = """
WITH AVERAGE_SCORES AS(
SELECT
DFC."RESTAURANT_ID",
AVG(DFC."SPEED") AS "SPEED",
AVG(DFC."SERVING") AS "SERVING",
AVG(DFC."FLAVOUR") AS "FLAVOUR"
FROM "EDW"."DWH_FB_COMMENT" DFC
WHERE 1=1
AND DFC."SPEED" IS NOT NULL
GROUP BY DFC."RESTAURANT_ID"
HAVING COUNT(*) >= 30
),
MENU AS(
SELECT
EFM."RESTAURANT_ID",
COUNT(*) AS "PRODUCT_COUNT",
SUM(CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END) AS "PIZZA_COUNT",
AVG(CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN EFM."PRODUCT_LISTED_PRICE" ELSE NULL END) AS "AVERAGE_PIZZA_PRICE",
AVG(CASE WHEN EFM."DISCOUNT" THEN 1 ELSE 0 END) AS "DISCOUNT_RATIO"
FROM "EDW"."DWH_FB_MENU" EFM
WHERE 1=1
AND LOWER("CATEGORY_NAME") NOT LIKE '%çecek%'
AND LOWER("CATEGORY_NAME") NOT LIKE '%poşet%'
AND LOWER("CATEGORY_NAME") NOT LIKE '%soslar%'
AND LOWER("CATEGORY_NAME") NOT LIKE '%kahve%'
AND LOWER("CATEGORY_NAME") NOT LIKE '%çay%'
GROUP BY EFM."RESTAURANT_ID"
),
SENTIMENT AS(
SELECT
DFCS."RESTAURANT_ID",
SUM(DFCS."SENTIMENT_SCORE") AS "TOTAL_SENTIMENT_SCORE",
CASE WHEN SUM(DFCS."SENTIMENT_SCORE") = 0 THEN 0 ELSE AVG(DFCS."SENTIMENT_SCORE") END AS "AVERAGE_SENTIMENT_SCORE",
CASE WHEN STDDEV(DFCS."SENTIMENT_SCORE") IS NULL THEN 0 ELSE STDDEV(DFCS."SENTIMENT_SCORE") END AS "STD_SENTIMENT_SCORE"
FROM "EDW"."DWH_FB_COMMENT_SENTIMENT" DFCS
GROUP BY DFCS."RESTAURANT_ID"
)
SELECT
DFR."RESTAURANT_ID",
DFR."RESTAURANT_BRAND",
DFR."RESTAURANT_DISTRICT",
DFR."RESTAURANT_STREET",
DFRS."MINIMUM_AMOUNT",
DFRS."MINIMUM_SERVICE_TIME",
DFRS."MAXIMUM_SERVICE_TIME",
(DFRS."MAXIMUM_SERVICE_TIME"+DFRS."MINIMUM_SERVICE_TIME")/2 AS "EXPECTED_SERVICE_TIME",
SQRT(POWER(DFRS."MAXIMUM_SERVICE_TIME"-DFRS."MINIMUM_SERVICE_TIME",2)/12) AS "STD_SERVICE_TIME",
(SQRT(POWER(DFRS."MAXIMUM_SERVICE_TIME"-DFRS."MINIMUM_SERVICE_TIME",2)/12))/((DFRS."MAXIMUM_SERVICE_TIME"+DFRS."MINIMUM_SERVICE_TIME")/2) AS "COV_SERVICE_TIME",
MENU."PRODUCT_COUNT",
MENU."PIZZA_COUNT",
MENU."AVERAGE_PIZZA_PRICE",
MENU."DISCOUNT_RATIO",
SNT."TOTAL_SENTIMENT_SCORE",
SNT."AVERAGE_SENTIMENT_SCORE",
SNT."STD_SENTIMENT_SCORE",
AVS."SPEED",
AVS."SERVING",
AVS."FLAVOUR"
FROM "EDW"."DWH_FB_RESTAURANT" DFR
INNER JOIN "EDW"."DWH_FB_RESTAURANT_SERVICE" DFRS ON (DFR."RESTAURANT_ID" = DFRS."RESTAURANT_ID")
INNER JOIN AVERAGE_SCORES AVS ON (DFR."RESTAURANT_ID" = AVS."RESTAURANT_ID")
INNER JOIN MENU ON (DFR."RESTAURANT_ID" = MENU."RESTAURANT_ID" AND MENU."PIZZA_COUNT" > 0)
INNER JOIN SENTIMENT SNT ON (DFR."RESTAURANT_ID"= SNT."RESTAURANT_ID");
"""
service_df = pd.read_sql(sql_command,conn)
service_df
| RESTAURANT_ID | RESTAURANT_BRAND | RESTAURANT_DISTRICT | RESTAURANT_STREET | MINIMUM_AMOUNT | MINIMUM_SERVICE_TIME | MAXIMUM_SERVICE_TIME | EXPECTED_SERVICE_TIME | STD_SERVICE_TIME | COV_SERVICE_TIME | PRODUCT_COUNT | PIZZA_COUNT | AVERAGE_PIZZA_PRICE | DISCOUNT_RATIO | TOTAL_SENTIMENT_SCORE | AVERAGE_SENTIMENT_SCORE | STD_SENTIMENT_SCORE | SPEED | SERVING | FLAVOUR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1991-tarihi-ortakoy-kumpircisi-silivri-yeni-ma... | 1991 Tarihi Ortaköy Kumpircisi | Silivri | Yeni Mah. | 30.0 | 20 | 30 | 25 | 2.886751 | 0.115470 | 45 | 10 | 41.600000 | 0.000000 | 66 | 1.081967 | 1.636002 | 8.803279 | 8.606557 | 8.606557 |
| 1 | 400-c-pizza-sisli-tesvikiye-mah-istanbul | 400° C Pizza | Şişli | Teşvikiye Mah. | 0.0 | 35 | 45 | 40 | 2.886751 | 0.072169 | 53 | 48 | 63.645833 | 0.018868 | 49 | 0.418803 | 1.334162 | 8.837607 | 8.350427 | 7.957265 |
| 2 | 400-c-pizza-umraniye-serifali-mah-istanbul | 400° C Pizza | Ümraniye | Şerifali Mah. | 5.0 | 30 | 40 | 35 | 2.886751 | 0.082479 | 43 | 43 | 65.232558 | 0.000000 | 49 | 0.382812 | 1.582986 | 8.398438 | 7.781250 | 8.164062 |
| 3 | 444-pizza-bagcilar-demirkapi-mah-istanbul | 444 Pizza | Bağcılar | Demirkapı Mah. | 0.0 | 35 | 45 | 40 | 2.886751 | 0.072169 | 62 | 40 | 42.382500 | 0.983871 | 62 | 0.273128 | 1.228471 | 7.392070 | 6.885463 | 6.986784 |
| 4 | 444-pizza-eyup-emniyettepe-mah-istanbul | 444 Pizza | Eyüp | Emniyettepe Mah. | 14.9 | 20 | 30 | 25 | 2.886751 | 0.115470 | 75 | 47 | 43.553191 | 0.053333 | 286 | 0.722222 | 1.417293 | 8.454545 | 8.373737 | 8.212121 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 865 | yummy-pizza-beylikduzu-adnan-kahveci-mah-istanbul | Yummy Pizza | Beylikdüzü | Adnan Kahveci Mah. | 20.0 | 30 | 40 | 35 | 2.886751 | 0.082479 | 54 | 49 | 37.508163 | 0.000000 | 82 | 1.188406 | 1.536568 | 9.130435 | 9.202899 | 9.202899 |
| 866 | yummy-pizza-pendik-esenyali-mah-istanbul | Yummy Pizza | Pendik | Esenyalı Mah. | 5.0 | 50 | 60 | 55 | 2.886751 | 0.052486 | 31 | 31 | 24.190323 | 0.096774 | 50 | 1.136364 | 1.374027 | 8.113636 | 7.931818 | 7.636364 |
| 867 | yuz-seksen-gram-pizza-umraniye-ataturk-mah-ist... | Yüz Seksen Gram Pizza | Ümraniye | Atatürk Mah. | 5.0 | 20 | 30 | 25 | 2.886751 | 0.115470 | 21 | 20 | 35.350000 | 0.142857 | 153 | 0.874286 | 1.680234 | 9.874286 | 9.720000 | 9.765714 |
| 868 | zi-burger-pizza-basaksehir-bahcesehir-2-kisim-... | Zi Burger & Pizza | Başakşehir | Bahçeşehir 2. Kısım Mah. | 35.0 | 35 | 45 | 40 | 2.886751 | 0.072169 | 46 | 28 | 43.035714 | 0.065217 | 36 | 0.590164 | 1.487470 | 8.704918 | 8.540984 | 8.557377 |
| 869 | zucca-pizza-sisli-bozkurt-mah-kurtulus-istanbul | Zucca Pizza | Şişli | Bozkurt Mah. - Kurtuluş | 5.0 | 20 | 30 | 25 | 2.886751 | 0.115470 | 64 | 32 | 62.281250 | 0.000000 | 125 | 0.490196 | 1.468468 | 9.400000 | 8.996078 | 8.698039 |
870 rows × 20 columns
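The STD_SERVICE_TIME and COV_SERVICE_TIME columns in the query above treat the delivery time as uniformly distributed between the advertised minimum and maximum: the standard deviation of a uniform distribution on [a, b] is (b - a) / sqrt(12), so a 20-30 minute window gives (30 - 20) / sqrt(12) ≈ 2.89, matching the table, and dividing by the midpoint (a + b) / 2 yields the coefficient of variation.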
stop_words = [element for element in stopwords.words('turkish') if element not in ['çok','eğer','gibi','hiç','niçin','niye','sanki','yani','en','az','birkaç','bazı','aslında','neden','hepsi']]
comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(lambda x: ' '.join([word for word in x.lower().split() if word not in (stop_words)]))
#comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
comment_df
| RESTAURANT_ID | USERNAME | COMMENT_TEXT | COMMENT_DATE | SPEED | ABOVE_AVERAGE_SPEED | SERVING | ABOVE_AVERAGE_SERVING | FLAVOUR | ABOVE_AVERAGE_FLAVOUR | DATE | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...o | herzamanki gibi mükemmel ellerinize saglik | 2021-05-26 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 1 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | favori pizzacım genelde madness söylüyordum yo... | 2021-05-25 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 2 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...t | şahane şimdi tekrarlıyorum siparişi | 2021-05-25 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 3 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | hamur seçimi yapılamıyor | 2021-05-24 | 10.0 | True | 9 | True | 7 | False | 2021-06-09 |
| 4 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...1 | pizza kesilmeden gönderilmişti lütfen dikkat e... | 2021-05-24 | 10.0 | True | 8 | False | 10 | True | 2021-06-09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271838 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | elinize sağlık çok başarılı | 2021-05-27 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 271839 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...i | super | 2021-05-27 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 271840 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...2 | pizzadan anlamayanların beğendiği bir pizza do... | 2021-05-27 | 10.0 | True | 5 | False | 5 | False | 2021-06-09 |
| 271841 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | cok guzeldi sefer tesekkurler | 2021-05-26 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
| 271842 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | hawaii pizza yarı yarıya pizzalar harika incec... | 2021-05-26 | 10.0 | True | 10 | True | 10 | True | 2021-06-09 |
271843 rows × 11 columns
Cleaning our review dataset: lowercasing all the text, removing punctuation, removing some Turkish stopwords, and estimating when each review was made. We also convert the string scores to integers.
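As a small illustration of the cleaning step, this is what clean_text plus the stop-word filter does to an invented review:
sample = "Pizza çok güzeldi, ama teslimat biraz geç geldi!"  # hypothetical review
cleaned = clean_text(sample)  # -> "pizza çok güzeldi ama teslimat biraz geç geldi"
print(' '.join(word for word in cleaned.split() if word not in stop_words))  # drops stop words such as 'ama'; 'çok' was deliberately kept above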
sql_command = """
SELECT
DFC."RESTAURANT_ID",
DFC."USERNAME",
DFC."COMMENT_TEXT"
FROM "EDW"."DWH_FB_COMMENT" DFC
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "EDW"."DWH_FB_COMMENT_SENTIMENT" DFCS WHERE DFC."RESTAURANT_ID" = DFCS."RESTAURANT_ID" AND DFC."USERNAME" = DFCS."USERNAME" AND DFC."COMMENT_TEXT" = DFCS."COMMENT_TEXT");
"""
sentiment_pre_df = pd.read_sql(sql_command,conn)
rows_per_loop = 100
end_row = len(sentiment_pre_df)+1
for i in range(0, end_row, rows_per_loop):
start_time = math.trunc(time.time())
sentiment_df = sentiment_pre_df[i: i+rows_per_loop].copy() # copy the slice so the SENTIMENT_SCORE assignment below does not modify a view
sentiment_df['SENTIMENT_SCORE'] = sentiment_df['COMMENT_TEXT'].apply(normalize_text).apply(sentiment_text)
df_column_conversation(sentiment_df, 'RESTAURANT_ID', 'text')
df_column_conversation(sentiment_df, 'USERNAME', 'text')
df_column_conversation(sentiment_df, 'COMMENT_TEXT', 'text')
execute_mogrify(conn,sentiment_df,"EDW","DWH_FB_COMMENT_SENTIMENT")
end_time = math.trunc(time.time())
print("Inserted rows between {starting_row} and {ending_row} ({rows_per_loop} rows) in {execute_time} seconds.".format(execute_time=end_time-start_time, starting_row=len(sentiment_df), ending_row=i+rows_per_loop, rows_per_loop=rows_per_loop))
Inserted rows between 0 and 100 (100 rows) in 0 seconds.
analyze_data('eda_restaurant.html',False,'ODS','EXT_FB_RESTAURANT')
analyze_data('eda_menu.html',False,'EDW','DWH_FB_MENU')
analyze_data('eda_product_image.html',False,'ODS','EXT_FB_PRODUCT_IMAGE')
analyze_data('eda_comment.html',False,'EDW','DWH_FB_COMMENT')
analyze_dataframe('eda_service.html',False,service_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME','EXPECTED_SERVICE_TIME','STD_SERVICE_TIME','COV_SERVICE_TIME','PRODUCT_COUNT','PIZZA_COUNT','AVERAGE_PIZZA_PRICE','DISCOUNT_RATIO','TOTAL_SENTIMENT_SCORE','AVERAGE_SENTIMENT_SCORE','STD_SENTIMENT_SCORE','SPEED','SERVING','FLAVOUR']])
Report eda_restaurant.html was generated. You can see the exploratory data analysis visualizations in your browser.
Report eda_menu.html was generated. You can see the exploratory data analysis visualizations in your browser.
Report eda_product_image.html was generated. You can see the exploratory data analysis visualizations in your browser.
Report eda_comment.html was generated. You can see the exploratory data analysis visualizations in your browser.
Report eda_service.html was generated. You can see the exploratory data analysis visualizations in your browser.
We have 1,189 distinct restaurants and 131,131 distinct products spread across 1,077 distinct categories and 23,109 distinct names; only 11% of the products are discounted.
Product prices average around 41 with a standard deviation of 30.4. We have around 5,112 product images.
Among the user reviews we have 271,843 comments, 221,312 of which are distinct. Most user scores are 10 (around 56-64% of all scores), and the second most common score is 1 (around 10-13%).
The dataset does not contain any missing data except in the user speed scores, where 3,495 rows (less than 1% of the data) are missing.
if(not(check_if_file_exists('images/word_cloud_popular_words.png'))):
sql_command = """
WITH WORDS AS(
SELECT
regexp_split_to_table(regexp_replace("COMMENT_TEXT", '[^[:alpha:]]', ' ', 'g'), '\s+') AS "WORD" -- replace non-letters with spaces, then split on whitespace; replacing with '' glued words together
FROM "EDW"."DWH_FB_COMMENT"
)
SELECT
"WORD",
COUNT(*) AS "WORD_COUNT"
FROM WORDS
WHERE LENGTH("WORD") > 1
GROUP BY WORDS."WORD"
ORDER BY COUNT(*) DESC
LIMIT 1000;
"""
df_word_cloud = pd.read_sql(sql_command,conn)
df_word_cloud['WORD'] = df_word_cloud['WORD'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
#df_word_cloud['WORD'] = df_word_cloud['WORD'].apply(normalize_text)
df_word_cloud['WORD'] = df_word_cloud['WORD'].apply(turkish_to_english_characters)
df_word_cloud = df_word_cloud.groupby(['WORD'], as_index=False)['WORD_COUNT'].sum() # assign the result; stemming above can map several words to the same stem
word_cloud_data = dict(zip(df_word_cloud['WORD'].tolist(), df_word_cloud['WORD_COUNT'].tolist()))
wc = WordCloud(background_color='white', width=8000, height=4000, max_words=10000).generate_from_frequencies(word_cloud_data)
wc.to_file('images/word_cloud_popular_words.png')
Image.open('images/word_cloud_popular_words.png')
for label in ['SPEED', 'SERVING', 'FLAVOUR']:
distribution_plot(comment_df, label)
merge_images(['images/distribution_plot_comment_df_SPEED.png','images/distribution_plot_comment_df_SERVING.png','images/distribution_plot_comment_df_FLAVOUR.png'], background_color='WHITE')
for label in ['SPEED', 'SERVING', 'FLAVOUR']:
print(kstest_result(comment_df,label))
Kolmogorov-Smirnov test in ___ for Speed statistic: 0.8896, p value: 0.0000
Kolmogorov-Smirnov test in ___ for Serving statistic: 0.8606, p value: 0.0000
Kolmogorov-Smirnov test in ___ for Flavour statistic: 0.8562, p value: 0.0000
We are going to transform our reviews with CountVectorizer. The process is similar to one-hot encoding, except that instead of a binary indicator it counts how many times each vocabulary word appears in a comment.
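A tiny illustration of the vectorizer's output on two made-up comments (default settings, so no min_df pruning):
demo_vectorizer = CountVectorizer()
demo_matrix = demo_vectorizer.fit_transform(["pizza güzel", "pizza pizza soğuk"])
print(sorted(demo_vectorizer.vocabulary_, key=demo_vectorizer.vocabulary_.get))  # ['güzel', 'pizza', 'soğuk']
print(demo_matrix.toarray())  # [[1 1 0], [0 2 1]]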
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_GaussianNB_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = GaussianNB()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526
Accuracy Score for SPEED: 33.73%
F1 Score for SPEED: 33.73%
Precision Score for SPEED: 33.73%
Total Features after vectorizing: 3526
Accuracy Score for SERVING: 29.20%
F1 Score for SERVING: 29.20%
Precision Score for SERVING: 29.20%
Total Features after vectorizing: 3526
Accuracy Score for FLAVOUR: 28.80%
F1 Score for FLAVOUR: 28.80%
Precision Score for FLAVOUR: 28.80%
Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SPEED: 47.57%
F1 Score for ABOVE_AVERAGE_SPEED: 47.57%
Precision Score for ABOVE_AVERAGE_SPEED: 47.57%
Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SERVING: 47.50%
F1 Score for ABOVE_AVERAGE_SERVING: 47.50%
Precision Score for ABOVE_AVERAGE_SERVING: 47.50%
Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 49.37%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 49.37%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 49.37%
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_SVC_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = SVC()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526
Accuracy Score for SPEED: 71.87%
F1 Score for SPEED: 71.87%
Precision Score for SPEED: 71.87%

Total Features after vectorizing: 3526
Accuracy Score for SERVING: 67.30%
F1 Score for SERVING: 67.30%
Precision Score for SERVING: 67.30%

Total Features after vectorizing: 3526
Accuracy Score for FLAVOUR: 64.90%
F1 Score for FLAVOUR: 64.90%
Precision Score for FLAVOUR: 64.90%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SPEED: 84.53%
F1 Score for ABOVE_AVERAGE_SPEED: 84.53%
Precision Score for ABOVE_AVERAGE_SPEED: 84.53%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SERVING: 83.63%
F1 Score for ABOVE_AVERAGE_SERVING: 83.63%
Precision Score for ABOVE_AVERAGE_SERVING: 83.63%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 85.93%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 85.93%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 85.93%
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_LinearSVC_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LinearSVC()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526
Accuracy Score for SPEED: 68.13%
F1 Score for SPEED: 68.13%
Precision Score for SPEED: 68.13%

Total Features after vectorizing: 3526
Accuracy Score for SERVING: 64.20%
F1 Score for SERVING: 64.20%
Precision Score for SERVING: 64.20%

Total Features after vectorizing: 3526
Accuracy Score for FLAVOUR: 63.60%
F1 Score for FLAVOUR: 63.60%
Precision Score for FLAVOUR: 63.60%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SPEED: 84.40%
F1 Score for ABOVE_AVERAGE_SPEED: 84.40%
Precision Score for ABOVE_AVERAGE_SPEED: 84.40%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SERVING: 83.23%
F1 Score for ABOVE_AVERAGE_SERVING: 83.23%
Precision Score for ABOVE_AVERAGE_SERVING: 83.23%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 87.53%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 87.53%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 87.53%
for label in ['SPEED','SERVING','FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_LinearRegression_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LinearRegression(n_jobs=-1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.2f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Total Features after vectorizing: 3526
Mean Squared Error for SPEED: 1495252546747191732994048.00
R2 Score for SPEED: -259351435300495553462272.00

Total Features after vectorizing: 3526
Mean Squared Error for SERVING: 1954283586295817564061696.00
R2 Score for SERVING: -271039388726782244945920.00

Total Features after vectorizing: 3526
Mean Squared Error for FLAVOUR: 2806207270090314824548352.00
R2 Score for FLAVOUR: -351137892913561798180864.00
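Ordinary least squares is hopeless on these features: with 3,526 sparse word-count dimensions and only about 7,000 training rows, the fitted coefficients extrapolate wildly on word combinations never seen during training, which is why the MSE explodes and R2 is astronomically negative.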
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_DecisionTreeClassifier_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
params = {
'max_depth': [10,13,14,15],
'min_samples_split': [2,3,4]
}
gscv = GridSearchCV(DecisionTreeClassifier(), params, cv=5)
gscv.fit(features_train, labels_train)
print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
model = gscv.best_estimator_
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526
Accuracy Score for SPEED: 72.20%
F1 Score for SPEED: 72.20%
Precision Score for SPEED: 72.20%

Total Features after vectorizing: 3526
Accuracy Score for SERVING: 66.83%
F1 Score for SERVING: 66.83%
Precision Score for SERVING: 66.83%

Total Features after vectorizing: 3526
Accuracy Score for FLAVOUR: 64.87%
F1 Score for FLAVOUR: 64.87%
Precision Score for FLAVOUR: 64.87%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SPEED: 85.07%
F1 Score for ABOVE_AVERAGE_SPEED: 85.07%
Precision Score for ABOVE_AVERAGE_SPEED: 85.07%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SERVING: 79.60%
F1 Score for ABOVE_AVERAGE_SERVING: 79.60%
Precision Score for ABOVE_AVERAGE_SERVING: 79.60%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 83.03%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 83.03%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 83.03%
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:9600][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_LogisticRegression_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LogisticRegression(penalty='l2', C=1.2, n_jobs=-1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434
Accuracy Score for SPEED: 70.90%
F1 Score for SPEED: 70.90%
Precision Score for SPEED: 70.90%

Total Features after vectorizing: 3434
Accuracy Score for SERVING: 67.33%
F1 Score for SERVING: 67.33%
Precision Score for SERVING: 67.33%

Total Features after vectorizing: 3434
Accuracy Score for FLAVOUR: 65.73%
F1 Score for FLAVOUR: 65.73%
Precision Score for FLAVOUR: 65.73%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_SPEED: 85.59%
F1 Score for ABOVE_AVERAGE_SPEED: 85.59%
Precision Score for ABOVE_AVERAGE_SPEED: 85.59%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_SERVING: 83.99%
F1 Score for ABOVE_AVERAGE_SERVING: 83.99%
Precision Score for ABOVE_AVERAGE_SERVING: 83.99%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 88.12%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 88.12%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 88.12%
Next we transform our reviews with the TF-IDF vectorizer. Unlike raw counts, TF-IDF weights each word by how frequently it appears in a review, scaled down by how common the word is across all reviews, so uninformative everyday words contribute less.
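Again a toy illustration (a hypothetical mini-corpus): a word that occurs in every review, like 'pizza' below, receives a lower TF-IDF weight than the rarer, more distinctive words. The same modeling loops are then repeated on the real TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer
toy_reviews = ["pizza lezzetli", "pizza soğuk", "pizza kurye hızlı"]  # hypothetical reviews
toy_vectorizer = TfidfVectorizer()
toy_weights = toy_vectorizer.fit_transform(toy_reviews)
print(toy_vectorizer.vocabulary_)      # word -> column index
print(toy_weights.toarray().round(2))  # 'pizza' has the smallest nonzero weight in every row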
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_GaussianNB_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = GaussianNB()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526
Accuracy Score for SPEED: 34.10%
F1 Score for SPEED: 34.10%
Precision Score for SPEED: 34.10%

Total Features after vectorizing: 3526
Accuracy Score for SERVING: 29.37%
F1 Score for SERVING: 29.37%
Precision Score for SERVING: 29.37%

Total Features after vectorizing: 3526
Accuracy Score for FLAVOUR: 29.10%
F1 Score for FLAVOUR: 29.10%
Precision Score for FLAVOUR: 29.10%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SPEED: 49.03%
F1 Score for ABOVE_AVERAGE_SPEED: 49.03%
Precision Score for ABOVE_AVERAGE_SPEED: 49.03%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SERVING: 50.43%
F1 Score for ABOVE_AVERAGE_SERVING: 50.43%
Precision Score for ABOVE_AVERAGE_SERVING: 50.43%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 52.20%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 52.20%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 52.20%
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_SVC_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
params = {
'C': [1.5],
'kernel': ['rbf']
}
gscv = GridSearchCV(SVC(), params, cv=5)
gscv.fit(features_train, labels_train)
print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
model = gscv.best_estimator_
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526
Accuracy Score for SPEED: 72.87%
F1 Score for SPEED: 72.87%
Precision Score for SPEED: 72.87%

Total Features after vectorizing: 3526
Accuracy Score for SERVING: 68.50%
F1 Score for SERVING: 68.50%
Precision Score for SERVING: 68.50%

Total Features after vectorizing: 3526
Accuracy Score for FLAVOUR: 66.83%
F1 Score for FLAVOUR: 66.83%
Precision Score for FLAVOUR: 66.83%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SPEED: 86.30%
F1 Score for ABOVE_AVERAGE_SPEED: 86.30%
Precision Score for ABOVE_AVERAGE_SPEED: 86.30%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SERVING: 85.33%
F1 Score for ABOVE_AVERAGE_SERVING: 85.33%
Precision Score for ABOVE_AVERAGE_SERVING: 85.33%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 88.67%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 88.67%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 88.67%
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_LinearSVC_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LinearSVC()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526
Accuracy Score for SPEED: 70.63%
F1 Score for SPEED: 70.63%
Precision Score for SPEED: 70.63%

Total Features after vectorizing: 3526
Accuracy Score for SERVING: 66.77%
F1 Score for SERVING: 66.77%
Precision Score for SERVING: 66.77%

Total Features after vectorizing: 3526
Accuracy Score for FLAVOUR: 66.30%
F1 Score for FLAVOUR: 66.30%
Precision Score for FLAVOUR: 66.30%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SPEED: 85.33%
F1 Score for ABOVE_AVERAGE_SPEED: 85.33%
Precision Score for ABOVE_AVERAGE_SPEED: 85.33%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_SERVING: 84.37%
F1 Score for ABOVE_AVERAGE_SERVING: 84.37%
Precision Score for ABOVE_AVERAGE_SERVING: 84.37%

Total Features after vectorizing: 3526
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 88.63%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 88.63%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 88.63%
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:9600][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_DecisionTreeClassifier_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
params = {
'max_depth': [9,10,11],
'min_samples_split': [2,3]
}
gscv = GridSearchCV(DecisionTreeClassifier(), params, cv=5)
gscv.fit(features_train, labels_train)
print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
model = gscv.best_estimator_
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434
Accuracy Score for SPEED: 71.56%
F1 Score for SPEED: 71.56%
Precision Score for SPEED: 71.56%

Total Features after vectorizing: 3434
Accuracy Score for SERVING: 66.70%
F1 Score for SERVING: 66.70%
Precision Score for SERVING: 66.70%

Total Features after vectorizing: 3434
Accuracy Score for FLAVOUR: 64.44%
F1 Score for FLAVOUR: 64.44%
Precision Score for FLAVOUR: 64.44%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_SPEED: 84.55%
F1 Score for ABOVE_AVERAGE_SPEED: 84.55%
Precision Score for ABOVE_AVERAGE_SPEED: 84.55%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_SERVING: 79.41%
F1 Score for ABOVE_AVERAGE_SERVING: 79.41%
Precision Score for ABOVE_AVERAGE_SERVING: 79.41%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 82.85%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 82.85%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 82.85%
for label in ['SPEED','SERVING','FLAVOUR','ABOVE_AVERAGE_SPEED','ABOVE_AVERAGE_SERVING','ABOVE_AVERAGE_FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:9600][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_LogisticRegression_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LogisticRegression(penalty='l2', C=1.2, n_jobs=-1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434
Accuracy Score for SPEED: 72.64%
F1 Score for SPEED: 72.64%
Precision Score for SPEED: 72.64%

Total Features after vectorizing: 3434
Accuracy Score for SERVING: 68.61%
F1 Score for SERVING: 68.61%
Precision Score for SERVING: 68.61%

Total Features after vectorizing: 3434
Accuracy Score for FLAVOUR: 66.63%
F1 Score for FLAVOUR: 66.63%
Precision Score for FLAVOUR: 66.63%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_SPEED: 85.76%
F1 Score for ABOVE_AVERAGE_SPEED: 85.76%
Precision Score for ABOVE_AVERAGE_SPEED: 85.76%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_SERVING: 84.03%
F1 Score for ABOVE_AVERAGE_SERVING: 84.03%
Precision Score for ABOVE_AVERAGE_SERVING: 84.03%

Total Features after vectorizing: 3434
Accuracy Score for ABOVE_AVERAGE_FLAVOUR: 87.74%
F1 Score for ABOVE_AVERAGE_FLAVOUR: 87.74%
Precision Score for ABOVE_AVERAGE_FLAVOUR: 87.74%
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_LinearRegression_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LinearRegression(n_jobs=-1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.2f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.68
R2 Score for SPEED: -0.01

Mean Squared Error for SERVING: 0.99
R2 Score for SERVING: 0.00

Mean Squared Error for FLAVOUR: 1.01
R2 Score for FLAVOUR: 0.01
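So the three raw service features (minimum amount, minimum and maximum service time) carry essentially no linear signal on their own: R2 is indistinguishable from zero for all three targets.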
for label in ['SPEED','SERVING','FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_LinearRegression_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LinearRegression(n_jobs=-1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.2f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Total Features after vectorizing: 3526
Mean Squared Error for SPEED: 314881175242905989802885120.00
R2 Score for SPEED: -54616114800145500106391552.00

Total Features after vectorizing: 3526
Mean Squared Error for SERVING: 32168932824251959961714688.00
R2 Score for SERVING: -4461505970688933851299840.00

Total Features after vectorizing: 3526
Mean Squared Error for FLAVOUR: 74430105557697276554510336.00
R2 Score for FLAVOUR: -9313364238423725143228416.00
for label in ['SPEED','SERVING','FLAVOUR']:
feature_columns = ['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']
model_name = "LGBMRegressor"
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[feature_columns].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_{model_name}_{label}.mdl".format(label = label, model_name=model_name)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LGBMRegressor(boosting_type='goss', n_jobs=-1, n_estimators=100, num_leaves=31, max_depth=-1, learning_rate=0.05, reg_alpha=0.0, reg_lambda=0.0)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
feature_importance_plot(label, model_name, feature_columns, model.feature_importances_)
merge_images(['images/SPEED.png', 'images/SERVING.png', 'images/FLAVOUR.png'])
Mean Squared Error for SPEED: 0.38
R2 Score for SPEED: 0.4395

Mean Squared Error for SERVING: 0.52
R2 Score for SERVING: 0.4737

Mean Squared Error for FLAVOUR: 0.55
R2 Score for FLAVOUR: 0.4598
for label in ['SPEED','SERVING','FLAVOUR']:
feature_columns = ['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME','EXPECTED_SERVICE_TIME','STD_SERVICE_TIME','COV_SERVICE_TIME','PRODUCT_COUNT','PIZZA_COUNT','AVERAGE_PIZZA_PRICE','DISCOUNT_RATIO']
model_name = "LGBMRegressor"
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[feature_columns].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_Statistical_{model_name}_{label}.mdl".format(label = label, model_name=model_name)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LGBMRegressor(boosting_type='goss', n_jobs=-1, n_estimators=100, num_leaves=31, max_depth=-1, learning_rate=0.05, reg_alpha=0.0, reg_lambda=0.0)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
feature_importance_plot(label, model_name, feature_columns, model.feature_importances_)
merge_images(['images/SPEED.png', 'images/SERVING.png', 'images/FLAVOUR.png'], background_color='WHITE')
Mean Squared Error for SPEED: 0.35
R2 Score for SPEED: 0.4813

Mean Squared Error for SERVING: 0.48
R2 Score for SERVING: 0.5180

Mean Squared Error for FLAVOUR: 0.50
R2 Score for FLAVOUR: 0.5131
for label in ['SPEED','SERVING','FLAVOUR']:
feature_columns = ['MINIMUM_AMOUNT', 'MINIMUM_SERVICE_TIME', 'MAXIMUM_SERVICE_TIME', 'EXPECTED_SERVICE_TIME', 'STD_SERVICE_TIME', 'COV_SERVICE_TIME', 'PRODUCT_COUNT', 'PIZZA_COUNT', 'AVERAGE_PIZZA_PRICE', 'DISCOUNT_RATIO', 'TOTAL_SENTIMENT_SCORE', 'AVERAGE_SENTIMENT_SCORE', 'STD_SENTIMENT_SCORE']
model_name = "LGBMRegressor"
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[feature_columns].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_Sentiment_{model_name}_{label}.mdl".format(label = label, model_name=model_name)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LGBMRegressor(boosting_type='goss', n_jobs=-1, n_estimators=100, num_leaves=31, max_depth=-1, learning_rate=0.05, reg_alpha=0.0, reg_lambda=0.0)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
feature_importance_plot(label, model_name, feature_columns, model.feature_importances_)
shap_values = shap.TreeExplainer(model).shap_values(features_test)  # y is not needed here; feature names go to summary_plot below
shap.summary_plot(shap_values, features_test, plot_type="bar", feature_names=feature_columns, show=False)
plt.tight_layout()
plt.autoscale()
plt.savefig('images/SHAP_{label}.png'.format(label=label))
plt.close()
merge_images([merge_images(['images/SPEED.png', 'images/SERVING.png', 'images/FLAVOUR.png'], background_color='WHITE'), merge_images(['images/SHAP_SPEED.png', 'images/SHAP_SERVING.png', 'images/SHAP_FLAVOUR.png'], background_color='WHITE')], orientation='vertical')
Mean Squared Error for SPEED: 0.18
R2 Score for SPEED: 0.7310

Mean Squared Error for SERVING: 0.22
R2 Score for SERVING: 0.7773

Mean Squared Error for FLAVOUR: 0.23
R2 Score for FLAVOUR: 0.7778
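Adding the sentiment-based features lifts R2 from roughly 0.48-0.52 to 0.73-0.78 on all three targets, which suggests most of the predictive signal lives in the review text rather than in the service metadata.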
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_XGBRegressor_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = XGBRegressor()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.36
R2 Score for SPEED: 0.4697

Mean Squared Error for SERVING: 0.49
R2 Score for SERVING: 0.5016

Mean Squared Error for FLAVOUR: 0.51
R2 Score for FLAVOUR: 0.4996
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_SGDRegressor_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = SGDRegressor(alpha=0.0001)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 764.54
R2 Score for SPEED: -1135.3673

Mean Squared Error for SERVING: 2323.84
R2 Score for SERVING: -2348.4848

Mean Squared Error for FLAVOUR: 827104.79
R2 Score for FLAVOUR: -813413.6787
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_KernelRidge_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = KernelRidge(kernel='polynomial', alpha=1, degree=5, coef0=1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.54
R2 Score for SPEED: 0.1988

Mean Squared Error for SERVING: 0.78
R2 Score for SERVING: 0.2133

Mean Squared Error for FLAVOUR: 0.81
R2 Score for FLAVOUR: 0.2021
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_ElasticNet_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = ElasticNet(normalize=True)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.67
R2 Score for SPEED: -0.0031

Mean Squared Error for SERVING: 0.99
R2 Score for SERVING: -0.0005

Mean Squared Error for FLAVOUR: 1.02
R2 Score for FLAVOUR: -0.0004
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_BayesianRidge_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = BayesianRidge(normalize=True)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.67
R2 Score for SPEED: 0.0036

Mean Squared Error for SERVING: 0.98
R2 Score for SERVING: 0.0070

Mean Squared Error for FLAVOUR: 1.01
R2 Score for FLAVOUR: 0.0067
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_GradientBoostingRegressor_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = GradientBoostingRegressor(loss='ls', n_estimators=50, min_samples_split=20, min_samples_leaf=5, max_depth=3)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.37
R2 Score for SPEED: 0.4478

Mean Squared Error for SERVING: 0.51
R2 Score for SERVING: 0.4811

Mean Squared Error for FLAVOUR: 0.53
R2 Score for FLAVOUR: 0.4817
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(service_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_SVR_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = SVR(kernel='rbf')
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.55
R2 Score for SPEED: 0.1761

Mean Squared Error for SERVING: 0.80
R2 Score for SERVING: 0.1872

Mean Squared Error for FLAVOUR: 0.84
R2 Score for FLAVOUR: 0.1771
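Across the regressors trained on the three basic service features, the gradient-boosted trees (LGBMRegressor, XGBRegressor, GradientBoostingRegressor) lead with R2 around 0.44-0.50, the kernel methods (KernelRidge, SVR) trail at roughly 0.18-0.21, the linear baselines explain essentially nothing, and SGDRegressor diverges outright, most likely because the inputs are unscaled.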
sql_command = """
WITH EFM AS(
SELECT DISTINCT
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
EFM."PRODUCT_LISTED_PRICE",
EFM."PRODUCT_PRICE",
EFM."DISCOUNT"
FROM "EDW"."DWH_FB_MENU" EFM
WHERE 1=1
AND CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END = 1
AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%soslar%' AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%yan %'
),
QUARTILES AS(
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q1,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q2,
PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q3
FROM EFM
),
OUTLIER_RANGES AS(
SELECT
QR.Q1,
QR.Q2,
QR.Q3,
QR.Q3-QR.Q1 AS IQR,
QR.Q1-(1.5*(QR.Q3-QR.Q1)) AS MINIMA,
QR.Q3+(1.5*(QR.Q3-QR.Q1)) AS MAXIMA
FROM QUARTILES QR
),
PIZZAS AS(
SELECT *
FROM EFM
INNER JOIN OUTLIER_RANGES ORA ON (1=1)
WHERE EFM."PRODUCT_LISTED_PRICE" BETWEEN ORA.MINIMA AND ORA.MAXIMA
ORDER BY EFM."PRODUCT_LISTED_PRICE"
)
SELECT
EFM.*,
PIZ.Q1,
PIZ.Q2,
PIZ.Q3,
PIZ.IQR,
PIZ.MINIMA,
PIZ.MAXIMA
FROM "EDW"."DWH_FB_MENU" EFM
INNER JOIN PIZZAS PIZ ON (EFM."CATEGORY_NAME" = PIZ."CATEGORY_NAME" AND EFM."PRODUCT_NAME" = PIZ."PRODUCT_NAME" AND EFM."PRODUCT_DESCRIPTION" = PIZ."PRODUCT_DESCRIPTION" AND EFM."PRODUCT_LISTED_PRICE" = PIZ."PRODUCT_LISTED_PRICE");
"""
menu_df = pd.read_sql(sql_command,conn)
menu_df.head()
| | PRODUCT_ID | RESTAURANT_ID | CATEGORY_NAME | PRODUCT_NAME | PRODUCT_DESCRIPTION | PRODUCT_LISTED_PRICE | PRODUCT_PRICE | DISCOUNT | DESIGN_TYPE | DATE | q1 | q2 | q3 | iqr | minima | maxima |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4F9A1222-2174-4794-85BE-78B864137B58 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Vodafone Menüleri | Vodafone Menüsü (Pizza Refanzine (32 cm.)) | Pizza Refanzine (32 cm.) + Coca-Cola (33 cl.) ... | 52.0 | 52.0 | False | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 1 | 8b503b63-b162-4b43-a58c-301a41c6b98a | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Rucola (25 cm.) | Dante sos, mozzarella peyniri, roka, cherry do... | 66.0 | 33.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 2 | 15cff837-c4f5-4ded-b09e-dd8c6fd5e591 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Funghi (25 cm.) | Dante sos, mozzarella peyniri, kültür mantarı | 58.0 | 29.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 3 | ef08f896-366b-40b0-9858-6cdc69401f41 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Quattro Formaggi (25 cm.) | Dante sos, mozzarella peyniri, parmesan peynir... | 76.0 | 38.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 4 | e7e2b046-6d7b-4636-bcd7-59be32110798 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Mediterraneo (25 cm.) | Dante sos, mozzarella peyniri, beyaz peynir, y... | 66.0 | 33.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
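The QUARTILES and OUTLIER_RANGES CTEs implement the standard Tukey fence: a product is kept only if its listed price falls inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], which filters out implausibly cheap or expensive 'pizza' items before any downstream analysis.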
sql_command = """
WITH EFM AS(
SELECT DISTINCT
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
EFM."PRODUCT_LISTED_PRICE",
EFM."PRODUCT_PRICE",
EFM."DISCOUNT"
FROM "EDW"."DWH_FB_MENU" EFM
WHERE 1=1
AND CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END = 1
AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%soslar%' AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%yan %'
),
QUARTILES AS(
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q1,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q2,
PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q3
FROM EFM
),
OUTLIER_RANGES AS(
SELECT
QR.Q1,
QR.Q2,
QR.Q3,
QR.Q3-QR.Q1 AS IQR,
QR.Q1-(1.5*(QR.Q3-QR.Q1)) AS MINIMA,
QR.Q3+(1.5*(QR.Q3-QR.Q1)) AS MAXIMA
FROM QUARTILES QR
),
PIZZAS AS(
SELECT *
FROM EFM
INNER JOIN OUTLIER_RANGES ORA ON (1=1)
WHERE EFM."PRODUCT_LISTED_PRICE" BETWEEN ORA.MINIMA AND ORA.MAXIMA
ORDER BY EFM."PRODUCT_LISTED_PRICE"
)
SELECT
EFM."RESTAURANT_ID",
AVG(EFM."PRODUCT_LISTED_PRICE") AS "AVERAGE_PRODUCT_LISTED_PRICE",
COUNT(*) AS "NUMBER_OF_PRODUCTS"
FROM "EDW"."DWH_FB_MENU" EFM
INNER JOIN PIZZAS PIZ ON (EFM."CATEGORY_NAME" = PIZ."CATEGORY_NAME" AND EFM."PRODUCT_NAME" = PIZ."PRODUCT_NAME" AND EFM."PRODUCT_DESCRIPTION" = PIZ."PRODUCT_DESCRIPTION" AND EFM."PRODUCT_LISTED_PRICE" = PIZ."PRODUCT_LISTED_PRICE")
GROUP BY EFM."RESTAURANT_ID";
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df.head()
| | RESTAURANT_ID | AVERAGE_PRODUCT_LISTED_PRICE | NUMBER_OF_PRODUCTS |
|---|---|---|---|
| 0 | 1991-tarihi-ortakoy-kumpircisi-silivri-yeni-ma... | 41.600000 | 10 |
| 1 | 400-c-pizza-sisli-tesvikiye-mah-istanbul | 62.042553 | 47 |
| 2 | 400-c-pizza-umraniye-serifali-mah-istanbul | 63.476190 | 42 |
| 3 | 444-pizza-bagcilar-demirkapi-mah-istanbul | 43.163793 | 58 |
| 4 | 444-pizza-eyup-emniyettepe-mah-istanbul | 43.818462 | 65 |
if not(check_if_file_exists('images/kmeans_product_cluster.png')):
model = KMeans(n_clusters=5).fit(menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values)
labels = model.predict(menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values)
plt.scatter(menu_df["PRODUCT_LISTED_PRICE"].values,menu_df["PRODUCT_PRICE"].values,c=labels, cmap='viridis');
plt.xlabel("Listed Price")
plt.ylabel("Price with Discount")
plt.title('K-Means Clustering on Products')
plt.savefig('images/kmeans_product_cluster.png')
plt.close()
Image.open('images/kmeans_product_cluster.png')
if not(check_if_file_exists('images/kmeans_restaurant_cluster.png')):
model = KMeans(n_clusters=5).fit(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
labels = model.predict(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
plt.scatter(restaurant_df["AVERAGE_PRODUCT_LISTED_PRICE"].values,restaurant_df["NUMBER_OF_PRODUCTS"].values,c=labels, cmap='viridis');
plt.xlabel("Average Listed Price")
plt.ylabel("Number of Products")
plt.title('K-Means Clustering on Restaurants')
plt.savefig('images/kmeans_restaurant_cluster.png')
plt.close()
Image.open('images/kmeans_restaurant_cluster.png')
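Both clustering features have long right tails (a few restaurants list many or very expensive products), and those tails dominate the Euclidean distances K-Means relies on; the next cell applies a log transform to compress them before clustering.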
if not(check_if_file_exists('images/kmeans_log_restaurant_cluster.png')):
temp_df = np.log(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]]+1)
model = KMeans(n_clusters=5).fit(temp_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
labels = model.predict(temp_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
plt.scatter(temp_df["AVERAGE_PRODUCT_LISTED_PRICE"].values,temp_df["NUMBER_OF_PRODUCTS"].values,c=labels, cmap='viridis');
plt.xlabel("Average Listed Price")
plt.ylabel("Number of Products")
plt.title('K-Means Clustering on Restaurants with Log Transformation')
plt.savefig('images/kmeans_log_restaurant_cluster.png')
plt.close()
Image.open('images/kmeans_log_restaurant_cluster.png')
if not(check_if_file_exists('images/gmm_product_cluster.png')):
model = mixture.GaussianMixture(n_components=5).fit(menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values)
labels = model.predict(menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values)
plt.scatter(menu_df["PRODUCT_LISTED_PRICE"].values,menu_df["PRODUCT_PRICE"].values,c=labels, cmap='viridis');
plt.xlabel("Listed Price")
plt.ylabel("Price with Discount")
plt.title('GMM Clustering on Products')
plt.savefig('images/gmm_product_cluster.png')
plt.close()
Image.open('images/gmm_product_cluster.png')
if not(check_if_file_exists('images/gmm_restaurant_cluster.png')):
model = mixture.GaussianMixture(n_components=5).fit(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
labels = model.predict(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
plt.scatter(restaurant_df["AVERAGE_PRODUCT_LISTED_PRICE"].values,restaurant_df["NUMBER_OF_PRODUCTS"].values,c=labels, cmap='viridis');
plt.xlabel("Average Listed Price")
plt.ylabel("Number of Products")
plt.title('GMM Clustering on Restaurants')
plt.savefig('images/gmm_restaurant_cluster.png')
plt.close()
Image.open('images/gmm_restaurant_cluster.png')
if not(check_if_file_exists('images/gmm_log_restaurant_cluster.png')):
temp_df = np.log(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]]+1)
model = mixture.GaussianMixture(n_components=5).fit(temp_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
labels = model.predict(temp_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
plt.scatter(temp_df["AVERAGE_PRODUCT_LISTED_PRICE"].values,temp_df["NUMBER_OF_PRODUCTS"].values,c=labels, cmap='viridis');
plt.xlabel("Average Listed Price")
plt.ylabel("Number of Products")
plt.title('GMM Clustering on Restaurants with Log Transformation')
plt.savefig('images/gmm_log_restaurant_cluster.png')
plt.close()
Image.open('images/gmm_log_restaurant_cluster.png')
Markov chains estimate how frequently words follow one another, and can then generate new text from whatever starting words we supply. Such models can be used to create fake reviews.
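As a minimal illustration with a made-up six-word corpus, the transition probabilities are just normalized bigram counts; the model we build below generalizes this to n-gram states.
# toy corpus (hypothetical); after 'çok', the chain moves to 'güzel' or 'soğuk' with probability 0.5 each
words = "pizza çok güzel pizza çok soğuk".split()
transitions = {}
for prev_word, next_word in zip(words, words[1:]):
    transitions.setdefault(prev_word, {}).setdefault(next_word, 0)
    transitions[prev_word][next_word] += 1
for prev_word, counts in transitions.items():
    total = sum(counts.values())
    print(prev_word, {word: count / total for word, count in counts.items()})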
sql_command = """
WITH COMMENTS AS(
SELECT "RESTAURANT_ID",
STRING_AGG(LOWER("COMMENT_TEXT"), ' ') AS ALL_COMMENT,
COUNT("COMMENT_TEXT") AS COMMENT_COUNT
FROM "ODS"."EXT_FB_COMMENT" EFC
GROUP BY "RESTAURANT_ID"
)
SELECT *
FROM COMMENTS
ORDER BY COMMENT_COUNT DESC;
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df.head()
| | RESTAURANT_ID | all_comment | comment_count |
|---|---|---|---|
| 0 | pizza-bulls-uskudar-altunizade-mah-istanbul | lezzeti eskisi gibi gelmedi bize. fark ödeyip ... | 2293 |
| 1 | pizza-bulls-uskudar-ferah-mah-istanbul | her şey çok güzeldi. elinize sağlık. hizli ve ... | 1764 |
| 2 | pizza-bulls-kartal-soganlik-yeni-mah-istanbul | cok pahali olmasi disinda bir problem yok gibi... | 1743 |
| 3 | pizza-bulls-umraniye-cakmak-mah-istanbul | çok lezzetli güvenle çoçuguma yedirebiliyorum ... | 1582 |
| 4 | pizza-bulls-atasehir-fetih-mah-istanbul | servis ve hız 10 üzerinden 20.. bu siparişi is... | 1277 |
import random
def make_markov_model(cleaned_stories, n_gram=2):
markov_model = {}
for i in range(len(cleaned_stories) - 2*n_gram + 1):  # stop while both the current and next n-gram windows fit
curr_state, next_state = "", ""
for j in range(n_gram):
curr_state += cleaned_stories[i+j] + " "
next_state += cleaned_stories[i+j+n_gram] + " "
curr_state = curr_state[:-1]
next_state = next_state[:-1]
if curr_state not in markov_model:
markov_model[curr_state] = {}
markov_model[curr_state][next_state] = 1
else:
if next_state in markov_model[curr_state]:
markov_model[curr_state][next_state] += 1
else:
markov_model[curr_state][next_state] = 1
# calculating transition probabilities
for curr_state, transition in markov_model.items():
total = sum(transition.values())
for state, count in transition.items():
markov_model[curr_state][state] = count/total
return markov_model
def generate_story(markov_model, limit=100, start='my god'):
n = 0
curr_state = start
next_state = None
story = ""
story+=curr_state+" "
while n<limit:
next_state = random.choices(list(markov_model[curr_state].keys()),
list(markov_model[curr_state].values()))
curr_state = next_state[0]
story+=curr_state+" "
n+=1
return story
restaurant_df['clean_comment'] = restaurant_df['all_comment'].apply(clean_text)
text = restaurant_df.loc[0]['clean_comment'].split()
markov_model = make_markov_model(text)
for i in range(20):
print(str(i)+". ", generate_story(markov_model, start="yemek çok", limit=20))
0. yemek çok lezzetliydi ve getiren arkadasta cok hizli geldi ve yanında ikram bile vardı kardesim hayatinda yedigi en iyi pizza cok lezzetliydi ozellikle ince hamur ve ici peynirli kenar kombinasyonunu cok begendim ellerinize saglik bilal bey bilal bey isteyerek olmadı siparişimizde profitrol
1. yemek çok lezzetliydi ve kurye de çok nazikti her şey çok iyiydi pizzanın atakan beye teşekkürler her şey çok güzeldi keşke her şube bu sefer lezzetli değildi bu yüzden bir puanı kırdım maalesef şimdiye kadar gelen en hızlı servis ve hız gerçekten
2. yemek çok lezzetliydi ve sıcak geldi ve gayet lezzetli gercekten hızlı ve sıcak ürün için barbekü sos gelmedi not kısmında yazdığıma reğman it was delicious and the service is nice tüm çalışanların emeğine ellerine sağlık ayrıca gönderdikleri sufle harikaydı mükemmel lezzet ve
3. yemek çok güzeldi semih beye çok teşekkür ediyorum harikasınız pizzalar gayet lezzetliydi tavsiye ederim pizza soğuk geldi pizza bullsun acik ara en iyi subesi ayrica bilal beye ilgisinden dolayı çok teşekkür ederim selim bey arayıp pizza ile ilgili geri bildirim almaya özen
4. yemek çok lezzetliydi ve restoran bana çok yakın olmamasına rağmen sıcaktı da soğan halkaları da tam tersi az pişmişti ama pizzanın lezzeti malzeme kalitesi ve malzeme kalitesi iyiydi sufle ikramı için çayrıca teşekkür ederim kurye de geç getirmedi ama nasıl böyle soguk
5. yemek çok lezzetliydi ve kurye de çoook efendi bravo eskisi gibi iyi bir seçim oldu burak arkadaşımıza da teşekkürler her zamanki gibi muhteşemdi ellerinize sağlık ikramlar için de teşekkür ederiz çok lezzetliydi i̇kram için atakan beye de hızından dolayı ayrıca teşekkür ederim
6. yemek çok güzeldi semih beye teşekkürler 10 10 101010 33 luk icecek kalmayinca litrelik gondermisler sagolsunlar ayrica kurye burak beye teşekkürler her şey çok güzeldi fakat malzemesi bu sefer az geldi her şey çok güzel tatlı ikramıyla gönülleri fethetti her zamanki gibi
7. yemek çok güzeldi semih bey harikasınız sıcak geldi pizzalar başarılı i̇kram olarak gönderdiğiniz suffle için ayrıca teşekkürler her zamanki gibi mükemmel ayrıca sufle hediyesi için teşekkürler semih ve ali beye nezaketi ve hizmet kalitesinden teşekkür ederiyorum bilal beye ilgilerinden dolayı müteşikkiriz sizden
8. yemek çok lezzetliydi ve restoran bana çok yakın olmamasına rağmen sıcaktı da soğan halkaları için de 11 puanı buradan veriyorum pizzayı yerken ağzım yandı adamsınız adam rıdvan bey e teşekkürler atakan beye teşekkür ederim saygıllı dürüst bir arkadaş teşekkür ederiz lezzetli ve
9. yemek çok güzeldi semih beye çok teşekkürler herşey mükemmel kesinlikle 1 numara uzun yıllardır severek sipariş verdiğim yer pizzaları cidden çok kaliteli ve güzeldi yine çok lezzetliydi atakan beye tatlı ikramları için atakan beye ilgisi ve özverili çalışmaları şahane helal valla bütün
10. yemek çok güzeldi semih beyin özel ilgisi için teşekkür ederim pizzalara gelince sıcak ve lezzetli geldi bilal bey diye bir arkadaş var mükemmel birisi hem yardımcı oldu hem de ikram için teşşekür ederiz pizzalar cok kaliteli ve lezzetli kuryeler hep güler yüzlü
11. yemek çok lezzetliydi ve kurye de çalışanlar da bir o kadar ilgili ve nazikler sufle ikram eden semih bey ile getiren çok kibar kurye arkadaş ali gayet kibardı pizza beklediğimden lezzetliydi teşekkür ederiz deniz ürünlü pizzası bu bölgenin en iyisiydi altunizade şubenize
12. yemek çok lezzetliydi ve hızlıydı tesekkurler çok hızlı getirdi ve adres çok tarif edilemez bir yerde ol asına rağmen konumu buldu lezzet desen mükemmel ilgi desen 10 numara 10 yıldız pizzalar tam istedigimiz gibiydi ayrica atakan beyr sufle ikrami icin tesekkurler tavuk
13. yemek çok güzeldi semih beye çok teşekkürler ilgisi için keep up the good work d muhteşem hızlı taze ve sıcak geldi pizzalarımız ve sufleler elinize saglik teşekkürler fasfasf gayet başarılı fakat ketçap ve mayonez olmasnı bekledim sipariş yanlışlığını ikramla telafi ettiler teşekkür
14. yemek çok lezzetliydi ve her şey harika i̇kram için atakan beye çok teşekkürler arayıp bizzat ilgilendiler lezzet hız ve sunum çok iyi şimdiye kadar hiç pişman olmadım şaşırtmadı her zaman sever siparis veriyorum benim favorim missisipi qalite rakiplerine göre gayet başarılı tercih
15. yemek çok lezzetliydi ve hızlı ulaştı he zamanki gibi cumali beye ayrıca hızlı ve her zaman olduğu gibi yine sıcak ve lezzetliydi servisi de çok beğendim kuryeniz çok güler yüzlü oluyor her zaman yanında sufle ya da hediye patates getiriyorlar pizza için
16. yemek çok güzeldi semih beye çok teşekkürler çok güleryüzlüydü her şey çok güzeldi i̇kram sufle ve sos ikraminiz icin tesekkurler cok guzeldi sufle de göndermişler çok düşünceli bir hareketti üsküdar çevresinde kesinlikle tercih etmeniz gereken bir konu personelimiz bilal bey saolsun çok
17. yemek çok güzeldi semih beyin özel ilgisi için servis hızlıydı sıcak sıcak tekrar getirdiler kaliteli hızlı ve lezzetli çalışmanız çok harika ellerinize sağlık sıcak ve lezizdi her zaman tercih edilecek güzel bir pizzaydı hızlı ve kaliteli servisi içşn teşekkür ederiz merhaba kenar
18. yemek çok güzeldi semih beye ilgisinden dolay çok teşekkürler atakan beye siparişimi hızlı getirdiği için teşekkür ederim her zamanki gibi hızlı ve çok nazik ibrahim beye çok teşekkürler arayıp bizzat ilgilendiler lezzet hız ve servis bilal bey muhteşem bir hizmet verdi bilal
19. yemek çok güzeldi semih beyin özel ilgisi için ayrıca teşekkür ederim tatlı ikramları için atakan beye ekstradan elma dilim patates yolladığı için teşekkür ediyorum kalite de sürekli olmanız sevindirici kalite bozmadan devam edin lütfen semih bey harikasınız sıcak pizzalarımız için çok teşekkür
Convolutional neural networks are a standard tool for image classification. For our labels, we check whether the product name, description, or category contains the word 'pizza'.
sql_command = """
SELECT
DISTINCT
FPI."IMAGE_LINK",
FPIS."SOURCE",
CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END AS "IS_PIZZA"
FROM "ODS"."EXT_FB_MENU" EFM
INNER JOIN "ODS"."EXT_FB_PRODUCT_IMAGE" FPI ON (EFM."PRODUCT_ID" = FPI."PRODUCT_ID")
INNER JOIN "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" FPIS ON (FPI."IMAGE_LINK" = FPIS."IMAGE_LINK");
"""
image_source_df = pd.read_sql(sql_command,conn)
image_source_df
| | IMAGE_LINK | SOURCE | IS_PIZZA |
|---|---|---|---|
| 0 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 1 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 2 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 3 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 4 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| ... | ... | ... | ... |
| 4961 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4962 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 4963 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4964 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4965 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
4966 rows × 3 columns
features = []
labels = []
for i in range(len(image_source_df)):
    # Decode each image's raw bytes into a 200x200 RGB array
    image_source = byte_image_to_numpy_array(image_source_df.iloc[i]["SOURCE"], 200)
    is_pizza = image_source_df.iloc[i]["IS_PIZZA"]
    features.append(image_source)
    labels.append(is_pizza)
# Stack into a 4-D tensor and scale pixel values into [0, 1]
features = np.array(features).reshape(-1, 200, 200, 3)
features = features.astype('float32') / 255
# One-hot encode the binary labels for the softmax output layer
labels = np_utils.to_categorical(labels, 2)
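byte_image_to_numpy_array was defined earlier in the pipeline; as a reminder of what it has to do, here is a minimal sketch, assuming the SOURCE column stores the raw JPEG bytes split into single-byte chunks (the b'\xff', b'\xd8' prefix in the table above is the JPEG magic number). The exact implementation may differ.
def byte_image_to_numpy_array_sketch(byte_list, size):
    # Reassemble the single-byte chunks into one JPEG byte string
    raw = b''.join(byte_list)
    # Decode the buffer into a BGR array, then convert to RGB
    bgr = cv2.imdecode(np.frombuffer(raw, dtype=np.uint8), cv2.IMREAD_COLOR)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    # Resize to the fixed resolution the CNN expects
    return cv2.resize(rgb, (size, size))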
batch_size = 16
epochs = 3
print("Feature Shape: {f_shape} Label Shape: {l_shape}".format(f_shape=features.shape, l_shape=labels.shape))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
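Pizza items are a minority of the menu, so before training it is worth checking that both classes are reasonably represented in each split. A quick sanity check (not part of the original pipeline) could be:
# Count pizza / non-pizza examples in each split (column 1 of the one-hot labels)
print("Train class counts:", np.bincount(labels_train.argmax(axis=1)))
print("Test class counts:", np.bincount(labels_test.argmax(axis=1)))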
model = tf.keras.Sequential([
tf.keras.layers.Conv2D(32, (3,3), padding='same', activation=tf.nn.relu,
input_shape=(200, 200, 3)),
tf.keras.layers.MaxPooling2D((2, 2), strides=2),
tf.keras.layers.Conv2D(32, (3,3), padding='same', activation=tf.nn.relu),
tf.keras.layers.MaxPooling2D((2, 2), strides=2),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(128, activation=tf.nn.relu),
tf.keras.layers.Dense(2, activation=tf.nn.softmax)
])
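If you want to verify that the 200x200 input flows through the conv/pool stack as intended, Keras can print each layer's output shape and parameter count:
model.summary()  # layer-by-layer output shapes and parameter counts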
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(features_train, labels_train, batch_size = batch_size, epochs = epochs, verbose = 1, validation_data = (features_test, labels_test))
label_prediction = model.predict(features_test)
score = model.evaluate(features_test, labels_test, verbose = 0)
print("""Loss: {loss_score:0.4f} Accuracy: {accuracy:0.2f}%""".format(loss_score=score[0], accuracy=score[1]*100))
Feature Shape: (4966, 200, 200, 3) Label Shape: (4966, 2)
Epoch 1/3
218/218 [==============================] - 94s 429ms/step - loss: 0.7465 - accuracy: 0.6815 - val_loss: 0.4465 - val_accuracy: 0.8034
Epoch 2/3
218/218 [==============================] - 91s 417ms/step - loss: 0.4163 - accuracy: 0.8052 - val_loss: 0.5169 - val_accuracy: 0.7537
Epoch 3/3
218/218 [==============================] - 92s 423ms/step - loss: 0.3423 - accuracy: 0.8562 - val_loss: 0.2970 - val_accuracy: 0.8812
Loss: 0.2970 Accuracy: 88.12%
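Accuracy alone can be misleading on an imbalanced label like this one. As an optional check (not part of the original output), the full confusion matrix over the test set can be computed with the already-imported sklearn metrics module:
# Rows = true class, columns = predicted class
pred_classes = label_prediction.argmax(axis=1)
true_classes = labels_test.argmax(axis=1)
print(metrics.confusion_matrix(true_classes, pred_classes))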
if(not(check_if_file_exists('images/cnn_tp.png') and check_if_file_exists('images/cnn_tn.png') and check_if_file_exists('images/cnn_fp.png') and check_if_file_exists('images/cnn_fn.png'))):
    # Find one example of each confusion-matrix cell and save it as an image.
    # None is used as the "not found yet" sentinel: the original False sentinel
    # breaks when the matching example sits at index 0, since 0 == False.
    tp, tn, fp, fn = None, None, None, None
    for test_row in range(len(labels_test)):
        true_label = labels_test[test_row][1].astype(np.uint8)
        prediction = round(label_prediction[test_row][1])
        if true_label == 1 and prediction == 1 and tp is None:
            tp = test_row
            plt.title('True Positive', fontweight='bold', fontsize=15)
            plt.imshow((features_test[tp]*255).astype(np.uint8))
            plt.axis('off')
            plt.savefig('images/cnn_tp.png')
            plt.close()
            continue
        if true_label == 0 and prediction == 0 and tn is None:
            tn = test_row
            plt.title('True Negative', fontweight='bold', fontsize=15)
            plt.imshow((features_test[tn]*255).astype(np.uint8))
            plt.axis('off')
            plt.savefig('images/cnn_tn.png')
            plt.close()
            continue
        if true_label == 0 and prediction == 1 and fp is None:
            fp = test_row
            plt.title('False Positive', fontweight='bold', fontsize=15)
            plt.imshow((features_test[fp]*255).astype(np.uint8))
            plt.axis('off')
            plt.savefig('images/cnn_fp.png')
            plt.close()
            continue
        if true_label == 1 and prediction == 0 and fn is None:
            fn = test_row
            plt.title('False Negative', fontweight='bold', fontsize=15)
            plt.imshow((features_test[fn]*255).astype(np.uint8))
            plt.axis('off')
            plt.savefig('images/cnn_fn.png')
            plt.close()
            continue
merge_images([merge_images(['images/cnn_tp.png', 'images/cnn_tn.png']), merge_images(['images/cnn_fp.png', 'images/cnn_fn.png'])], orientation='vertical')
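Retraining takes roughly 90 seconds per epoch above, so, following the same file-existence caching pattern used for the images, the fitted model could also be persisted and reloaded on later runs. The path below is only illustrative:
model.save('models/cnn_pizza.h5')  # hypothetical path, not part of the original pipeline
# ...and on a later run, skip training entirely:
# model = keras.models.load_model('models/cnn_pizza.h5')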